Initial import of valgrind 3.6.0.
diff --git a/VEX/priv/guest_amd64_defs.h b/VEX/priv/guest_amd64_defs.h
new file mode 100644
index 0000000..42451fa
--- /dev/null
+++ b/VEX/priv/guest_amd64_defs.h
@@ -0,0 +1,480 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_amd64_defs.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* Only to be used within the guest-amd64 directory. */
+
+#ifndef __VEX_GUEST_AMD64_DEFS_H
+#define __VEX_GUEST_AMD64_DEFS_H
+
+
+/*---------------------------------------------------------*/
+/*--- amd64 to IR conversion ---*/
+/*---------------------------------------------------------*/
+
+/* Convert one amd64 insn to IR. See the type DisOneInstrFn in
+   guest_generic_bb_to_IR.h. */
+extern
+DisResult disInstr_AMD64 ( IRSB* irbb,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_code,
+ Long delta,
+ Addr64 guest_IP,
+ VexArch guest_arch,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo,
+ Bool host_bigendian );
+
+/* Used by the optimiser to specialise calls to helpers. */
+extern
+IRExpr* guest_amd64_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts );
+
+/* Describes to the optimiser which parts of the guest state require
+   precise memory exceptions. This is logically part of the guest
+   state description. */
+extern
+Bool guest_amd64_state_requires_precise_mem_exns ( Int, Int );
+
+extern
+VexGuestLayout amd64guest_layout;
+
+
+/*---------------------------------------------------------*/
+/*--- amd64 guest helpers ---*/
+/*---------------------------------------------------------*/
+
+/* --- CLEAN HELPERS --- */
+
+extern ULong amd64g_calculate_rflags_all (
+ ULong cc_op,
+ ULong cc_dep1, ULong cc_dep2, ULong cc_ndep
+ );
+
+extern ULong amd64g_calculate_rflags_c (
+ ULong cc_op,
+ ULong cc_dep1, ULong cc_dep2, ULong cc_ndep
+ );
+
+extern ULong amd64g_calculate_condition (
+ ULong/*AMD64Condcode*/ cond,
+ ULong cc_op,
+ ULong cc_dep1, ULong cc_dep2, ULong cc_ndep
+ );
+
+extern ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl );
+
+extern ULong amd64g_calculate_RCR (
+ ULong arg, ULong rot_amt, ULong rflags_in, Long sz
+ );
+
+extern ULong amd64g_calculate_RCL (
+ ULong arg, ULong rot_amt, ULong rflags_in, Long sz
+ );
+
+extern ULong amd64g_calculate_pclmul(ULong s1, ULong s2, ULong which);
+
+extern ULong amd64g_check_fldcw ( ULong fpucw );
+
+extern ULong amd64g_create_fpucw ( ULong fpround );
+
+extern ULong amd64g_check_ldmxcsr ( ULong mxcsr );
+
+extern ULong amd64g_create_mxcsr ( ULong sseround );
+
+extern VexEmWarn amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord );
+
+extern void amd64g_dirtyhelper_FSTENV ( VexGuestAMD64State*, HWord );
+
+/* Translate a guest virtual_addr into a guest linear address by
+ consulting the supplied LDT/GDT structures. Their representation
+ must be as specified in pub/libvex_guest_amd64.h. To indicate a
+ translation failure, 1<<32 is returned. On success, the lower 32
+ bits of the returned result indicate the linear address.
+*/
+//extern
+//ULong amd64g_use_seg_selector ( HWord ldt, HWord gdt,
+// UInt seg_selector, UInt virtual_addr );
+
+extern ULong amd64g_calculate_mmx_pmaddwd ( ULong, ULong );
+extern ULong amd64g_calculate_mmx_psadbw ( ULong, ULong );
+extern ULong amd64g_calculate_mmx_pmovmskb ( ULong );
+extern ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo );
+
+
+/* --- DIRTY HELPERS --- */
+
+extern ULong amd64g_dirtyhelper_loadF80le ( ULong/*addr*/ );
+
+extern void amd64g_dirtyhelper_storeF80le ( ULong/*addr*/, ULong/*data*/ );
+
+extern void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st );
+extern void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st );
+extern void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st );
+
+extern void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* );
+
+extern void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State*, HWord );
+
+extern ULong amd64g_dirtyhelper_RDTSC ( void );
+
+extern ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ );
+extern void amd64g_dirtyhelper_OUT ( ULong portno, ULong data,
+ ULong sz/*1,2 or 4*/ );
+
+extern void amd64g_dirtyhelper_SxDT ( void* address,
+ ULong op /* 0 or 1 */ );
+
+/* Helps with PCMP{I,E}STR{I,M}.
+
+   CALLED FROM GENERATED CODE: DIRTY HELPER(s). (Not really dirty,
+   in fact; it could be a clean helper, were it not that we can't
+   pass 2 x V128 by value to a clean helper, nor have one returned.)
+ Reads guest state, writes to guest state for the xSTRM cases, no
+ accesses of memory, is a pure function.
+
+   opc4_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
+   the callee knows which I/E and I/M variant it is dealing with and
+   what the specific operation is. The 4th byte of the opcode is in
+   the range 0x60 to 0x63:
+ istri 66 0F 3A 63
+ istrm 66 0F 3A 62
+ estri 66 0F 3A 61
+ estrm 66 0F 3A 60
+
+ gstOffL and gstOffR are the guest state offsets for the two XMM
+ register inputs. We never have to deal with the memory case since
+ that is handled by pre-loading the relevant value into the fake
+ XMM16 register.
+
+ For ESTRx variants, edxIN and eaxIN hold the values of those two
+ registers.
+
+ In all cases, the bottom 16 bits of the result contain the new
+ OSZACP %rflags values. For xSTRI variants, bits[31:16] of the
+ result hold the new %ecx value. For xSTRM variants, the helper
+ writes the result directly to the guest XMM0.
+
+ Declarable side effects: in all cases, reads guest state at
+ [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes
+ guest_XMM0.
+
+ Is expected to be called with opc_and_imm combinations which have
+ actually been validated, and will assert if otherwise. The front
+ end should ensure we're only called with verified values.
+*/
+extern ULong amd64g_dirtyhelper_PCMPxSTRx (
+ VexGuestAMD64State*,
+ HWord opc4_and_imm,
+ HWord gstOffL, HWord gstOffR,
+ HWord edxIN, HWord eaxIN
+ );
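+
+/* Illustrative only: with the encoding described above, a front end
+   handling "pcmpistri $0x4A, %xmm2, %xmm1" (4th opcode byte 0x63)
+   would pass opc4_and_imm = (0x63 << 8) | 0x4A = 0x634A. */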
+
+
+//extern void amd64g_dirtyhelper_CPUID_sse0 ( VexGuestAMD64State* );
+//extern void amd64g_dirtyhelper_CPUID_sse1 ( VexGuestAMD64State* );
+//extern void amd64g_dirtyhelper_CPUID_sse2 ( VexGuestAMD64State* );
+
+//extern void amd64g_dirtyhelper_FSAVE ( VexGuestAMD64State*, HWord );
+
+//extern VexEmWarn
+// amd64g_dirtyhelper_FRSTOR ( VexGuestAMD64State*, HWord );
+
+//extern void amd64g_dirtyhelper_FSTENV ( VexGuestAMD64State*, HWord );
+
+//extern VexEmWarn
+// amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord );
+
+
+
+/*---------------------------------------------------------*/
+/*--- Condition code stuff ---*/
+/*---------------------------------------------------------*/
+
+/* rflags masks */
+#define AMD64G_CC_SHIFT_O 11
+#define AMD64G_CC_SHIFT_S 7
+#define AMD64G_CC_SHIFT_Z 6
+#define AMD64G_CC_SHIFT_A 4
+#define AMD64G_CC_SHIFT_C 0
+#define AMD64G_CC_SHIFT_P 2
+
+#define AMD64G_CC_MASK_O (1ULL << AMD64G_CC_SHIFT_O)
+#define AMD64G_CC_MASK_S (1ULL << AMD64G_CC_SHIFT_S)
+#define AMD64G_CC_MASK_Z (1ULL << AMD64G_CC_SHIFT_Z)
+#define AMD64G_CC_MASK_A (1ULL << AMD64G_CC_SHIFT_A)
+#define AMD64G_CC_MASK_C (1ULL << AMD64G_CC_SHIFT_C)
+#define AMD64G_CC_MASK_P (1ULL << AMD64G_CC_SHIFT_P)
+
+/* FPU flag masks */
+#define AMD64G_FC_SHIFT_C3 14
+#define AMD64G_FC_SHIFT_C2 10
+#define AMD64G_FC_SHIFT_C1 9
+#define AMD64G_FC_SHIFT_C0 8
+
+#define AMD64G_FC_MASK_C3 (1ULL << AMD64G_FC_SHIFT_C3)
+#define AMD64G_FC_MASK_C2 (1ULL << AMD64G_FC_SHIFT_C2)
+#define AMD64G_FC_MASK_C1 (1ULL << AMD64G_FC_SHIFT_C1)
+#define AMD64G_FC_MASK_C0 (1ULL << AMD64G_FC_SHIFT_C0)
+
+
+/* %RFLAGS thunk descriptors. A four-word thunk is used to record
+ details of the most recent flag-setting operation, so the flags can
+ be computed later if needed. It is possible to do this a little
+ more efficiently using a 3-word thunk, but that makes it impossible
+ to describe the flag data dependencies sufficiently accurately for
+ Memcheck. Hence 4 words are used, with minimal loss of efficiency.
+
+ The four words are:
+
+ CC_OP, which describes the operation.
+
+ CC_DEP1 and CC_DEP2. These are arguments to the operation.
+ We want Memcheck to believe that the resulting flags are
+ data-dependent on both CC_DEP1 and CC_DEP2, hence the
+ name DEP.
+
+ CC_NDEP. This is a 3rd argument to the operation which is
+ sometimes needed. We arrange things so that Memcheck does
+ not believe the resulting flags are data-dependent on CC_NDEP
+ ("not dependent").
+
+ To make Memcheck believe that (the definedness of) the encoded
+ flags depends only on (the definedness of) CC_DEP1 and CC_DEP2
+ requires two things:
+
+ (1) In the guest state layout info (amd64guest_layout), CC_OP and
+ CC_NDEP are marked as always defined.
+
+ (2) When passing the thunk components to an evaluation function
+ (calculate_condition, calculate_eflags, calculate_eflags_c) the
+ IRCallee's mcx_mask must be set so as to exclude from
+ consideration all passed args except CC_DEP1 and CC_DEP2.
+
+ Strictly speaking only (2) is necessary for correctness. However,
+ (1) helps efficiency in that since (2) means we never ask about the
+ definedness of CC_OP or CC_NDEP, we may as well not even bother to
+ track their definedness.
+
+ When building the thunk, it is always necessary to write words into
+ CC_DEP1 and CC_DEP2, even if those args are not used given the
+   CC_OP field (eg, CC_DEP2 is not used if CC_OP is LOGICB/W/L/Q).
+ This is important because otherwise Memcheck could give false
+ positives as it does not understand the relationship between the
+ CC_OP field and CC_DEP1 and CC_DEP2, and so believes that the
+ definedness of the stored flags always depends on both CC_DEP1 and
+ CC_DEP2.
+
+ However, it is only necessary to set CC_NDEP when the CC_OP value
+ requires it, because Memcheck ignores CC_NDEP, and the evaluation
+ functions do understand the CC_OP fields and will only examine
+ CC_NDEP for suitable values of CC_OP.
+
+ A summary of the field usages is:
+
+ Operation DEP1 DEP2 NDEP
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ add/sub/mul first arg second arg unused
+
+ adc/sbb first arg (second arg)
+ XOR old_carry old_carry
+
+ and/or/xor result zero unused
+
+ inc/dec result zero old_carry
+
+ shl/shr/sar result subshifted- unused
+ result
+
+ rol/ror result zero old_flags
+
+ copy old_flags zero unused.
+
+
+ Therefore Memcheck will believe the following:
+
+ * add/sub/mul -- definedness of result flags depends on definedness
+ of both args.
+
+ * adc/sbb -- definedness of result flags depends on definedness of
+ both args and definedness of the old C flag. Because only two
+ DEP fields are available, the old C flag is XOR'd into the second
+ arg so that Memcheck sees the data dependency on it. That means
+ the NDEP field must contain a second copy of the old C flag
+ so that the evaluation functions can correctly recover the second
+ arg.
+
+ * and/or/xor are straightforward -- definedness of result flags
+ depends on definedness of result value.
+
+ * inc/dec -- definedness of result flags depends only on
+ definedness of result. This isn't really true -- it also depends
+ on the old C flag. However, we don't want Memcheck to see that,
+ and so the old C flag must be passed in NDEP and not in DEP2.
+ It's inconceivable that a compiler would generate code that puts
+ the C flag in an undefined state, then does an inc/dec, which
+ leaves C unchanged, and then makes a conditional jump/move based
+ on C. So our fiction seems a good approximation.
+
+ * shl/shr/sar -- straightforward, again, definedness of result
+ flags depends on definedness of result value. The subshifted
+ value (value shifted one less) is also needed, but its
+ definedness is the same as the definedness of the shifted value.
+
+   * rol/ror -- these only set O and C, and leave A Z S P alone.
+ However it seems prudent (as per inc/dec) to say the definedness
+ of all resulting flags depends on the definedness of the result,
+ hence the old flags must go in as NDEP and not DEP2.
+
+ * rcl/rcr are too difficult to do in-line, and so are done by a
+ helper function. They are not part of this scheme. The helper
+ function takes the value to be rotated, the rotate amount and the
+ old flags, and returns the new flags and the rotated value.
+ Since the helper's mcx_mask does not have any set bits, Memcheck
+ will lazily propagate undefinedness from any of the 3 args into
+ both results (flags and actual value).
+*/
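+
+/* Worked example (illustrative): after "addl %ebx, %eax" the front
+   end would store the thunk
+      CC_OP   = AMD64G_CC_OP_ADDL
+      CC_DEP1 = argL (the old value of %eax)
+      CC_DEP2 = argR (%ebx)
+      CC_NDEP = unused for ADD
+   and a later "jz" is then evaluated lazily as
+   amd64g_calculate_condition(AMD64CondZ, CC_OP, CC_DEP1, CC_DEP2,
+   CC_NDEP), without the flags ever being computed eagerly. */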
+enum {
+ AMD64G_CC_OP_COPY=0, /* DEP1 = current flags, DEP2 = 0, NDEP = unused */
+ /* just copy DEP1 to output */
+
+ AMD64G_CC_OP_ADDB, /* 1 */
+ AMD64G_CC_OP_ADDW, /* 2 DEP1 = argL, DEP2 = argR, NDEP = unused */
+ AMD64G_CC_OP_ADDL, /* 3 */
+ AMD64G_CC_OP_ADDQ, /* 4 */
+
+ AMD64G_CC_OP_SUBB, /* 5 */
+ AMD64G_CC_OP_SUBW, /* 6 DEP1 = argL, DEP2 = argR, NDEP = unused */
+ AMD64G_CC_OP_SUBL, /* 7 */
+ AMD64G_CC_OP_SUBQ, /* 8 */
+
+ AMD64G_CC_OP_ADCB, /* 9 */
+ AMD64G_CC_OP_ADCW, /* 10 DEP1 = argL, DEP2 = argR ^ oldCarry, NDEP = oldCarry */
+ AMD64G_CC_OP_ADCL, /* 11 */
+ AMD64G_CC_OP_ADCQ, /* 12 */
+
+ AMD64G_CC_OP_SBBB, /* 13 */
+ AMD64G_CC_OP_SBBW, /* 14 DEP1 = argL, DEP2 = argR ^ oldCarry, NDEP = oldCarry */
+ AMD64G_CC_OP_SBBL, /* 15 */
+ AMD64G_CC_OP_SBBQ, /* 16 */
+
+ AMD64G_CC_OP_LOGICB, /* 17 */
+ AMD64G_CC_OP_LOGICW, /* 18 DEP1 = result, DEP2 = 0, NDEP = unused */
+ AMD64G_CC_OP_LOGICL, /* 19 */
+ AMD64G_CC_OP_LOGICQ, /* 20 */
+
+ AMD64G_CC_OP_INCB, /* 21 */
+ AMD64G_CC_OP_INCW, /* 22 DEP1 = result, DEP2 = 0, NDEP = oldCarry (0 or 1) */
+ AMD64G_CC_OP_INCL, /* 23 */
+ AMD64G_CC_OP_INCQ, /* 24 */
+
+ AMD64G_CC_OP_DECB, /* 25 */
+ AMD64G_CC_OP_DECW, /* 26 DEP1 = result, DEP2 = 0, NDEP = oldCarry (0 or 1) */
+ AMD64G_CC_OP_DECL, /* 27 */
+ AMD64G_CC_OP_DECQ, /* 28 */
+
+ AMD64G_CC_OP_SHLB, /* 29 DEP1 = res, DEP2 = res', NDEP = unused */
+ AMD64G_CC_OP_SHLW, /* 30 where res' is like res but shifted one bit less */
+ AMD64G_CC_OP_SHLL, /* 31 */
+ AMD64G_CC_OP_SHLQ, /* 32 */
+
+ AMD64G_CC_OP_SHRB, /* 33 DEP1 = res, DEP2 = res', NDEP = unused */
+ AMD64G_CC_OP_SHRW, /* 34 where res' is like res but shifted one bit less */
+ AMD64G_CC_OP_SHRL, /* 35 */
+ AMD64G_CC_OP_SHRQ, /* 36 */
+
+ AMD64G_CC_OP_ROLB, /* 37 */
+ AMD64G_CC_OP_ROLW, /* 38 DEP1 = res, DEP2 = 0, NDEP = old flags */
+ AMD64G_CC_OP_ROLL, /* 39 */
+ AMD64G_CC_OP_ROLQ, /* 40 */
+
+ AMD64G_CC_OP_RORB, /* 41 */
+ AMD64G_CC_OP_RORW, /* 42 DEP1 = res, DEP2 = 0, NDEP = old flags */
+ AMD64G_CC_OP_RORL, /* 43 */
+ AMD64G_CC_OP_RORQ, /* 44 */
+
+ AMD64G_CC_OP_UMULB, /* 45 */
+ AMD64G_CC_OP_UMULW, /* 46 DEP1 = argL, DEP2 = argR, NDEP = unused */
+ AMD64G_CC_OP_UMULL, /* 47 */
+ AMD64G_CC_OP_UMULQ, /* 48 */
+
+ AMD64G_CC_OP_SMULB, /* 49 */
+ AMD64G_CC_OP_SMULW, /* 50 DEP1 = argL, DEP2 = argR, NDEP = unused */
+ AMD64G_CC_OP_SMULL, /* 51 */
+ AMD64G_CC_OP_SMULQ, /* 52 */
+
+ AMD64G_CC_OP_NUMBER
+};
+
+typedef
+ enum {
+ AMD64CondO = 0, /* overflow */
+ AMD64CondNO = 1, /* no overflow */
+
+ AMD64CondB = 2, /* below */
+ AMD64CondNB = 3, /* not below */
+
+ AMD64CondZ = 4, /* zero */
+ AMD64CondNZ = 5, /* not zero */
+
+ AMD64CondBE = 6, /* below or equal */
+ AMD64CondNBE = 7, /* not below or equal */
+
+ AMD64CondS = 8, /* negative */
+ AMD64CondNS = 9, /* not negative */
+
+ AMD64CondP = 10, /* parity even */
+ AMD64CondNP = 11, /* not parity even */
+
+      AMD64CondL      = 12, /* less */
+ AMD64CondNL = 13, /* not less */
+
+ AMD64CondLE = 14, /* less or equal */
+ AMD64CondNLE = 15, /* not less or equal */
+
+ AMD64CondAlways = 16 /* HACK */
+ }
+ AMD64Condcode;
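+
+/* Note the pairing: each negated condition is its base condition with
+   bit 0 set (eg AMD64CondNZ == AMD64CondZ | 1), which is why
+   amd64g_calculate_condition can evaluate the base predicate and then
+   invert it with "inv = cond & 1". */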
+
+#endif /* ndef __VEX_GUEST_AMD64_DEFS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end guest_amd64_defs.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_amd64_helpers.c b/VEX/priv/guest_amd64_helpers.c
new file mode 100644
index 0000000..a920ecd
--- /dev/null
+++ b/VEX/priv/guest_amd64_helpers.c
@@ -0,0 +1,2874 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_amd64_helpers.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_emwarn.h"
+#include "libvex_guest_amd64.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+#include "main_util.h"
+#include "guest_generic_bb_to_IR.h"
+#include "guest_amd64_defs.h"
+#include "guest_generic_x87.h"
+
+
+/* This file contains helper functions for amd64 guest code.
+ Calls to these functions are generated by the back end.
+ These calls are of course in the host machine code and
+ this file will be compiled to host machine code, so that
+ all makes sense.
+
+   Only change the signatures of these helper functions very
+   carefully. If you change the signature here, you'll have to change
+   the parameters passed to it in the IR calls constructed by
+   guest_amd64_toIR.c.
+
+ The convention used is that all functions called from generated
+ code are named amd64g_<something>, and any function whose name lacks
+ that prefix is not called from generated code. Note that some
+ LibVEX_* functions can however be called by VEX's client, but that
+ is not the same as calling them from VEX-generated code.
+*/
+
+
+/* Set to 1 to get detailed profiling info about use of the flag
+ machinery. */
+#define PROFILE_RFLAGS 0
+
+
+/*---------------------------------------------------------------*/
+/*--- %rflags run-time helpers. ---*/
+/*---------------------------------------------------------------*/
+
+/* Do 64x64 -> 128 signed/unsigned multiplies, for computing flags
+ after imulq/mulq. */
+
+static void mullS64 ( Long u, Long v, Long* rHi, Long* rLo )
+{
+ ULong u0, v0, w0;
+ Long u1, v1, w1, w2, t;
+ u0 = u & 0xFFFFFFFFULL;
+ u1 = u >> 32;
+ v0 = v & 0xFFFFFFFFULL;
+ v1 = v >> 32;
+ w0 = u0 * v0;
+ t = u1 * v0 + (w0 >> 32);
+ w1 = t & 0xFFFFFFFFULL;
+ w2 = t >> 32;
+ w1 = u0 * v1 + w1;
+ *rHi = u1 * v1 + w2 + (w1 >> 32);
+ *rLo = u * v;
+}
+
+static void mullU64 ( ULong u, ULong v, ULong* rHi, ULong* rLo )
+{
+ ULong u0, v0, w0;
+ ULong u1, v1, w1,w2,t;
+ u0 = u & 0xFFFFFFFFULL;
+ u1 = u >> 32;
+ v0 = v & 0xFFFFFFFFULL;
+ v1 = v >> 32;
+ w0 = u0 * v0;
+ t = u1 * v0 + (w0 >> 32);
+ w1 = t & 0xFFFFFFFFULL;
+ w2 = t >> 32;
+ w1 = u0 * v1 + w1;
+ *rHi = u1 * v1 + w2 + (w1 >> 32);
+ *rLo = u * v;
+}
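+
+/* Quick sanity check (illustrative): mullU64(0xFFFFFFFFFFFFFFFFULL, 2,
+   &hi, &lo) computes (2^64 - 1) * 2 = 2^65 - 2, giving
+   hi = 1 and lo = 0xFFFFFFFFFFFFFFFEULL. */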
+
+
+static const UChar parity_table[256] = {
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0,
+ 0, AMD64G_CC_MASK_P, AMD64G_CC_MASK_P, 0, AMD64G_CC_MASK_P, 0, 0, AMD64G_CC_MASK_P,
+};
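+
+/* The table encodes the x86 PF rule: PF is set iff the low byte of
+   the result contains an even number of 1 bits. A sketch of how its
+   contents could be regenerated (illustrative only, not part of the
+   build):
+
+      Int i, j, bits;
+      for (i = 0; i < 256; i++) {
+         for (bits = 0, j = 0; j < 8; j++)
+            bits += (i >> j) & 1;
+         table[i] = (bits & 1) ? 0 : AMD64G_CC_MASK_P;
+      }
+*/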
+
+/* generalised left-shifter */
+static inline Long lshift ( Long x, Int n )
+{
+ if (n >= 0)
+ return x << n;
+ else
+ return x >> (-n);
+}
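+
+/* Note that x is signed, so a negative n gives an arithmetic right
+   shift; eg lshift(res, 8 - 64) in the quad-sized cases below shifts
+   res right by 56 places, bringing the sign bit down to bit 7. */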
+
+/* identity on ULong */
+static inline ULong idULong ( ULong x )
+{
+ return x;
+}
+
+
+#define PREAMBLE(__data_bits) \
+ /* const */ ULong DATA_MASK \
+ = __data_bits==8 \
+ ? 0xFFULL \
+ : (__data_bits==16 \
+ ? 0xFFFFULL \
+ : (__data_bits==32 \
+ ? 0xFFFFFFFFULL \
+ : 0xFFFFFFFFFFFFFFFFULL)); \
+ /* const */ ULong SIGN_MASK = 1ULL << (__data_bits - 1); \
+ /* const */ ULong CC_DEP1 = cc_dep1_formal; \
+ /* const */ ULong CC_DEP2 = cc_dep2_formal; \
+ /* const */ ULong CC_NDEP = cc_ndep_formal; \
+ /* Four bogus assignments, which hopefully gcc can */ \
+ /* optimise away, and which stop it complaining about */ \
+ /* unused variables. */ \
+ SIGN_MASK = SIGN_MASK; \
+ DATA_MASK = DATA_MASK; \
+ CC_DEP2 = CC_DEP2; \
+ CC_NDEP = CC_NDEP;
+
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ Long argL, argR, res; \
+ argL = CC_DEP1; \
+ argR = CC_DEP2; \
+ res = argL + argR; \
+ cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
+ 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
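+
+/* The OF term above is the usual signed-overflow test: overflow
+   occurs when both addends have the same sign and the result's sign
+   differs. Illustrative 8-bit case: argL = 0x7F, argR = 0x01 gives
+   res = 0x80; (argL ^ argR ^ -1) & (argL ^ res) has bit 7 set, and
+   lshift by 12 - 8 = 4 moves it to bit 11, which is
+   AMD64G_CC_MASK_O. */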
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ Long argL, argR, res; \
+ argL = CC_DEP1; \
+ argR = CC_DEP2; \
+ res = argL - argR; \
+ cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = lshift((argL ^ argR) & (argL ^ res), \
+ 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ Long argL, argR, oldC, res; \
+ oldC = CC_NDEP & AMD64G_CC_MASK_C; \
+ argL = CC_DEP1; \
+ argR = CC_DEP2 ^ oldC; \
+ res = (argL + argR) + oldC; \
+ if (oldC) \
+ cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
+ else \
+ cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
+ 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ Long argL, argR, oldC, res; \
+ oldC = CC_NDEP & AMD64G_CC_MASK_C; \
+ argL = CC_DEP1; \
+ argR = CC_DEP2 ^ oldC; \
+ res = (argL - argR) - oldC; \
+ if (oldC) \
+ cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \
+ else \
+ cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = lshift((argL ^ argR) & (argL ^ res), \
+ 12 - DATA_BITS) & AMD64G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ cf = 0; \
+ pf = parity_table[(UChar)CC_DEP1]; \
+ af = 0; \
+ zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
+ sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
+ of = 0; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ Long argL, argR, res; \
+ res = CC_DEP1; \
+ argL = res - 1; \
+ argR = 1; \
+ cf = CC_NDEP & AMD64G_CC_MASK_C; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = ((res & DATA_MASK) == SIGN_MASK) << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ Long argL, argR, res; \
+ res = CC_DEP1; \
+ argL = res + 1; \
+ argR = 1; \
+ cf = CC_NDEP & AMD64G_CC_MASK_C; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = ((res & DATA_MASK) \
+ == ((ULong)SIGN_MASK - 1)) << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ cf = (CC_DEP2 >> (DATA_BITS - 1)) & AMD64G_CC_MASK_C; \
+ pf = parity_table[(UChar)CC_DEP1]; \
+ af = 0; /* undefined */ \
+ zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
+ sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
+ /* of is defined if shift count == 1 */ \
+ of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
+ & AMD64G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ cf = CC_DEP2 & 1; \
+ pf = parity_table[(UChar)CC_DEP1]; \
+ af = 0; /* undefined */ \
+ zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
+ sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
+ /* of is defined if shift count == 1 */ \
+ of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
+ & AMD64G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+/* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). */
+/* DEP1 = result, NDEP = old flags */
+#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long fl \
+ = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
+ | (AMD64G_CC_MASK_C & CC_DEP1) \
+ | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
+ 11-(DATA_BITS-1)) \
+ ^ lshift(CC_DEP1, 11))); \
+ return fl; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+/* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */
+/* DEP1 = result, NDEP = old flags */
+#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long fl \
+ = (CC_NDEP & ~(AMD64G_CC_MASK_O | AMD64G_CC_MASK_C)) \
+ | (AMD64G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \
+ | (AMD64G_CC_MASK_O & (lshift(CC_DEP1, \
+ 11-(DATA_BITS-1)) \
+ ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \
+ return fl; \
+ } \
+}
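+
+/* Illustrative 8-bit ROR check: rotating 0x01 right by 1 gives
+   CC_DEP1 = 0x80, so cf' = msb = 1 and of' = msb ^ msb-1 = 1 ^ 0 = 1:
+   lshift(0x80,4) ^ lshift(0x80,5) = 0x800 ^ 0x1000, and masking with
+   AMD64G_CC_MASK_O (bit 11) keeps the 0x800. */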
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE, NARROWtoU, \
+ DATA_U2TYPE, NARROWto2U) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ DATA_UTYPE hi; \
+ DATA_UTYPE lo \
+ = NARROWtoU( ((DATA_UTYPE)CC_DEP1) \
+ * ((DATA_UTYPE)CC_DEP2) ); \
+ DATA_U2TYPE rr \
+ = NARROWto2U( \
+ ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \
+ * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) ); \
+ hi = NARROWtoU(rr >>/*u*/ DATA_BITS); \
+ cf = (hi != 0); \
+ pf = parity_table[(UChar)lo]; \
+ af = 0; /* undefined */ \
+ zf = (lo == 0) << 6; \
+ sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
+ of = cf << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE, NARROWtoS, \
+ DATA_S2TYPE, NARROWto2S) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Long cf, pf, af, zf, sf, of; \
+ DATA_STYPE hi; \
+ DATA_STYPE lo \
+ = NARROWtoS( ((DATA_STYPE)CC_DEP1) \
+ * ((DATA_STYPE)CC_DEP2) ); \
+ DATA_S2TYPE rr \
+ = NARROWto2S( \
+ ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \
+ * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) ); \
+ hi = NARROWtoS(rr >>/*s*/ DATA_BITS); \
+ cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \
+ pf = parity_table[(UChar)lo]; \
+ af = 0; /* undefined */ \
+ zf = (lo == 0) << 6; \
+ sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
+ of = cf << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_UMULQ \
+{ \
+ PREAMBLE(64); \
+ { Long cf, pf, af, zf, sf, of; \
+ ULong lo, hi; \
+ mullU64( (ULong)CC_DEP1, (ULong)CC_DEP2, &hi, &lo ); \
+ cf = (hi != 0); \
+ pf = parity_table[(UChar)lo]; \
+ af = 0; /* undefined */ \
+ zf = (lo == 0) << 6; \
+ sf = lshift(lo, 8 - 64) & 0x80; \
+ of = cf << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SMULQ \
+{ \
+ PREAMBLE(64); \
+ { Long cf, pf, af, zf, sf, of; \
+ Long lo, hi; \
+ mullS64( (Long)CC_DEP1, (Long)CC_DEP2, &hi, &lo ); \
+ cf = (hi != (lo >>/*s*/ (64-1))); \
+ pf = parity_table[(UChar)lo]; \
+ af = 0; /* undefined */ \
+ zf = (lo == 0) << 6; \
+ sf = lshift(lo, 8 - 64) & 0x80; \
+ of = cf << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
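+
+/* In the signed cases, CF (and hence OF) is clear exactly when hi is
+   the sign-extension of lo, ie when the full product fits in the low
+   half. Eg (-1) * 1: lo is all-ones, lo >>s 63 == -1 == hi, so
+   cf = 0. */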
+
+
+#if PROFILE_RFLAGS
+
+static Bool initted = False;
+
+/* C flag, fast route */
+static UInt tabc_fast[AMD64G_CC_OP_NUMBER];
+/* C flag, slow route */
+static UInt tabc_slow[AMD64G_CC_OP_NUMBER];
+/* table for calculate_cond */
+static UInt tab_cond[AMD64G_CC_OP_NUMBER][16];
+/* total entry counts for calc_all, calc_c, calc_cond. */
+static UInt n_calc_all = 0;
+static UInt n_calc_c = 0;
+static UInt n_calc_cond = 0;
+
+#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
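+/* ie, dump the counts roughly once every 2^22 (~4.2 million) calls */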
+
+
+static void showCounts ( void )
+{
+ Int op, co;
+ Char ch;
+ vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n",
+ n_calc_all, n_calc_cond, n_calc_c);
+
+ vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE"
+ " S NS P NP L NL LE NLE\n");
+ vex_printf(" -----------------------------------------------------"
+ "----------------------------------------\n");
+ for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
+
+ ch = ' ';
+ if (op > 0 && (op-1) % 4 == 0)
+ ch = 'B';
+ if (op > 0 && (op-1) % 4 == 1)
+ ch = 'W';
+ if (op > 0 && (op-1) % 4 == 2)
+ ch = 'L';
+ if (op > 0 && (op-1) % 4 == 3)
+ ch = 'Q';
+
+ vex_printf("%2d%c: ", op, ch);
+ vex_printf("%6u ", tabc_slow[op]);
+ vex_printf("%6u ", tabc_fast[op]);
+ for (co = 0; co < 16; co++) {
+ Int n = tab_cond[op][co];
+ if (n >= 1000) {
+ vex_printf(" %3dK", n / 1000);
+ } else
+ if (n >= 0) {
+ vex_printf(" %3d ", n );
+ } else {
+ vex_printf(" ");
+ }
+ }
+ vex_printf("\n");
+ }
+ vex_printf("\n");
+}
+
+static void initCounts ( void )
+{
+ Int op, co;
+ initted = True;
+ for (op = 0; op < AMD64G_CC_OP_NUMBER; op++) {
+ tabc_fast[op] = tabc_slow[op] = 0;
+ for (co = 0; co < 16; co++)
+ tab_cond[op][co] = 0;
+ }
+}
+
+#endif /* PROFILE_RFLAGS */
+
+
+/* Calculate all 6 flags from the supplied thunk parameters.
+   Worker function, not directly called from generated code; the
+   clean-helper wrappers below are. */
+static
+ULong amd64g_calculate_rflags_all_WRK ( ULong cc_op,
+ ULong cc_dep1_formal,
+ ULong cc_dep2_formal,
+ ULong cc_ndep_formal )
+{
+ switch (cc_op) {
+ case AMD64G_CC_OP_COPY:
+ return cc_dep1_formal
+ & (AMD64G_CC_MASK_O | AMD64G_CC_MASK_S | AMD64G_CC_MASK_Z
+ | AMD64G_CC_MASK_A | AMD64G_CC_MASK_C | AMD64G_CC_MASK_P);
+
+ case AMD64G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar );
+ case AMD64G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort );
+ case AMD64G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt );
+ case AMD64G_CC_OP_ADDQ: ACTIONS_ADD( 64, ULong );
+
+ case AMD64G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar );
+ case AMD64G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort );
+ case AMD64G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt );
+ case AMD64G_CC_OP_ADCQ: ACTIONS_ADC( 64, ULong );
+
+ case AMD64G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar );
+ case AMD64G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort );
+ case AMD64G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt );
+ case AMD64G_CC_OP_SUBQ: ACTIONS_SUB( 64, ULong );
+
+ case AMD64G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar );
+ case AMD64G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort );
+ case AMD64G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt );
+ case AMD64G_CC_OP_SBBQ: ACTIONS_SBB( 64, ULong );
+
+ case AMD64G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar );
+ case AMD64G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
+ case AMD64G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt );
+ case AMD64G_CC_OP_LOGICQ: ACTIONS_LOGIC( 64, ULong );
+
+ case AMD64G_CC_OP_INCB: ACTIONS_INC( 8, UChar );
+ case AMD64G_CC_OP_INCW: ACTIONS_INC( 16, UShort );
+ case AMD64G_CC_OP_INCL: ACTIONS_INC( 32, UInt );
+ case AMD64G_CC_OP_INCQ: ACTIONS_INC( 64, ULong );
+
+ case AMD64G_CC_OP_DECB: ACTIONS_DEC( 8, UChar );
+ case AMD64G_CC_OP_DECW: ACTIONS_DEC( 16, UShort );
+ case AMD64G_CC_OP_DECL: ACTIONS_DEC( 32, UInt );
+ case AMD64G_CC_OP_DECQ: ACTIONS_DEC( 64, ULong );
+
+ case AMD64G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar );
+ case AMD64G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort );
+ case AMD64G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt );
+ case AMD64G_CC_OP_SHLQ: ACTIONS_SHL( 64, ULong );
+
+ case AMD64G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar );
+ case AMD64G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort );
+ case AMD64G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt );
+ case AMD64G_CC_OP_SHRQ: ACTIONS_SHR( 64, ULong );
+
+ case AMD64G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar );
+ case AMD64G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort );
+ case AMD64G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt );
+ case AMD64G_CC_OP_ROLQ: ACTIONS_ROL( 64, ULong );
+
+ case AMD64G_CC_OP_RORB: ACTIONS_ROR( 8, UChar );
+ case AMD64G_CC_OP_RORW: ACTIONS_ROR( 16, UShort );
+ case AMD64G_CC_OP_RORL: ACTIONS_ROR( 32, UInt );
+ case AMD64G_CC_OP_RORQ: ACTIONS_ROR( 64, ULong );
+
+ case AMD64G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, toUChar,
+ UShort, toUShort );
+ case AMD64G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, toUShort,
+ UInt, toUInt );
+ case AMD64G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, toUInt,
+ ULong, idULong );
+
+ case AMD64G_CC_OP_UMULQ: ACTIONS_UMULQ;
+
+ case AMD64G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, toUChar,
+ Short, toUShort );
+ case AMD64G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, toUShort,
+ Int, toUInt );
+ case AMD64G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, toUInt,
+ Long, idULong );
+
+ case AMD64G_CC_OP_SMULQ: ACTIONS_SMULQ;
+
+ default:
+ /* shouldn't really make these calls from generated code */
+ vex_printf("amd64g_calculate_rflags_all_WRK(AMD64)"
+ "( %llu, 0x%llx, 0x%llx, 0x%llx )\n",
+ cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
+ vpanic("amd64g_calculate_rflags_all_WRK(AMD64)");
+ }
+}
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate all 6 flags from the supplied thunk parameters. */
+ULong amd64g_calculate_rflags_all ( ULong cc_op,
+ ULong cc_dep1,
+ ULong cc_dep2,
+ ULong cc_ndep )
+{
+# if PROFILE_RFLAGS
+ if (!initted) initCounts();
+ n_calc_all++;
+ if (SHOW_COUNTS_NOW) showCounts();
+# endif
+ return
+ amd64g_calculate_rflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
+}
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate just the carry flag from the supplied thunk parameters. */
+ULong amd64g_calculate_rflags_c ( ULong cc_op,
+ ULong cc_dep1,
+ ULong cc_dep2,
+ ULong cc_ndep )
+{
+# if PROFILE_RFLAGS
+ if (!initted) initCounts();
+ n_calc_c++;
+ tabc_fast[cc_op]++;
+ if (SHOW_COUNTS_NOW) showCounts();
+# endif
+
+ /* Fast-case some common ones. */
+ switch (cc_op) {
+ case AMD64G_CC_OP_COPY:
+ return (cc_dep1 >> AMD64G_CC_SHIFT_C) & 1;
+ case AMD64G_CC_OP_LOGICQ:
+ case AMD64G_CC_OP_LOGICL:
+ case AMD64G_CC_OP_LOGICW:
+ case AMD64G_CC_OP_LOGICB:
+ return 0;
+ // case AMD64G_CC_OP_SUBL:
+ // return ((UInt)cc_dep1) < ((UInt)cc_dep2)
+ // ? AMD64G_CC_MASK_C : 0;
+ // case AMD64G_CC_OP_SUBW:
+ // return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
+ // ? AMD64G_CC_MASK_C : 0;
+ // case AMD64G_CC_OP_SUBB:
+ // return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
+ // ? AMD64G_CC_MASK_C : 0;
+ // case AMD64G_CC_OP_INCL:
+ // case AMD64G_CC_OP_DECL:
+ // return cc_ndep & AMD64G_CC_MASK_C;
+ default:
+ break;
+ }
+
+# if PROFILE_RFLAGS
+ tabc_fast[cc_op]--;
+ tabc_slow[cc_op]++;
+# endif
+
+ return amd64g_calculate_rflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
+ & AMD64G_CC_MASK_C;
+}
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* returns 1 or 0 */
+ULong amd64g_calculate_condition ( ULong/*AMD64Condcode*/ cond,
+ ULong cc_op,
+ ULong cc_dep1,
+ ULong cc_dep2,
+ ULong cc_ndep )
+{
+ ULong rflags = amd64g_calculate_rflags_all_WRK(cc_op, cc_dep1,
+ cc_dep2, cc_ndep);
+ ULong of,sf,zf,cf,pf;
+ ULong inv = cond & 1;
+
+# if PROFILE_RFLAGS
+ if (!initted) initCounts();
+ tab_cond[cc_op][cond]++;
+ n_calc_cond++;
+ if (SHOW_COUNTS_NOW) showCounts();
+# endif
+
+ switch (cond) {
+ case AMD64CondNO:
+ case AMD64CondO: /* OF == 1 */
+ of = rflags >> AMD64G_CC_SHIFT_O;
+ return 1 & (inv ^ of);
+
+ case AMD64CondNZ:
+ case AMD64CondZ: /* ZF == 1 */
+ zf = rflags >> AMD64G_CC_SHIFT_Z;
+ return 1 & (inv ^ zf);
+
+ case AMD64CondNB:
+ case AMD64CondB: /* CF == 1 */
+ cf = rflags >> AMD64G_CC_SHIFT_C;
+ return 1 & (inv ^ cf);
+
+ case AMD64CondNBE:
+ case AMD64CondBE: /* (CF or ZF) == 1 */
+ cf = rflags >> AMD64G_CC_SHIFT_C;
+ zf = rflags >> AMD64G_CC_SHIFT_Z;
+ return 1 & (inv ^ (cf | zf));
+
+ case AMD64CondNS:
+ case AMD64CondS: /* SF == 1 */
+ sf = rflags >> AMD64G_CC_SHIFT_S;
+ return 1 & (inv ^ sf);
+
+ case AMD64CondNP:
+ case AMD64CondP: /* PF == 1 */
+ pf = rflags >> AMD64G_CC_SHIFT_P;
+ return 1 & (inv ^ pf);
+
+ case AMD64CondNL:
+ case AMD64CondL: /* (SF xor OF) == 1 */
+ sf = rflags >> AMD64G_CC_SHIFT_S;
+ of = rflags >> AMD64G_CC_SHIFT_O;
+ return 1 & (inv ^ (sf ^ of));
+
+ case AMD64CondNLE:
+ case AMD64CondLE: /* ((SF xor OF) or ZF) == 1 */
+ sf = rflags >> AMD64G_CC_SHIFT_S;
+ of = rflags >> AMD64G_CC_SHIFT_O;
+ zf = rflags >> AMD64G_CC_SHIFT_Z;
+ return 1 & (inv ^ ((sf ^ of) | zf));
+
+ default:
+ /* shouldn't really make these calls from generated code */
+ vex_printf("amd64g_calculate_condition"
+ "( %llu, %llu, 0x%llx, 0x%llx, 0x%llx )\n",
+ cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
+ vpanic("amd64g_calculate_condition");
+ }
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+ULong LibVEX_GuestAMD64_get_rflags ( /*IN*/VexGuestAMD64State* vex_state )
+{
+ ULong rflags = amd64g_calculate_rflags_all_WRK(
+ vex_state->guest_CC_OP,
+ vex_state->guest_CC_DEP1,
+ vex_state->guest_CC_DEP2,
+ vex_state->guest_CC_NDEP
+ );
+ Long dflag = vex_state->guest_DFLAG;
+ vassert(dflag == 1 || dflag == -1);
+   if (dflag == -1)
+      rflags |= (1<<10);  /* DF: direction flag is bit 10 */
+   if (vex_state->guest_IDFLAG == 1)
+      rflags |= (1<<21);  /* ID: CPUID-detection flag is bit 21 */
+   if (vex_state->guest_ACFLAG == 1)
+      rflags |= (1<<18);  /* AC: alignment-check flag is bit 18 */
+
+ return rflags;
+}
+
+/* VISIBLE TO LIBVEX CLIENT */
+void
+LibVEX_GuestAMD64_put_rflag_c ( ULong new_carry_flag,
+ /*MOD*/VexGuestAMD64State* vex_state )
+{
+ ULong oszacp = amd64g_calculate_rflags_all_WRK(
+ vex_state->guest_CC_OP,
+ vex_state->guest_CC_DEP1,
+ vex_state->guest_CC_DEP2,
+ vex_state->guest_CC_NDEP
+ );
+ if (new_carry_flag & 1) {
+ oszacp |= AMD64G_CC_MASK_C;
+ } else {
+ oszacp &= ~AMD64G_CC_MASK_C;
+ }
+ vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
+ vex_state->guest_CC_DEP1 = oszacp;
+ vex_state->guest_CC_DEP2 = 0;
+ vex_state->guest_CC_NDEP = 0;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- %rflags translation-time function specialisers. ---*/
+/*--- These help iropt specialise calls the above run-time ---*/
+/*--- %rflags functions. ---*/
+/*---------------------------------------------------------------*/
+
+/* Used by the optimiser to try specialisations. Returns an
+ equivalent expression, or NULL if none. */
+
+static Bool isU64 ( IRExpr* e, ULong n )
+{
+ return toBool( e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U64
+ && e->Iex.Const.con->Ico.U64 == n );
+}
+
+IRExpr* guest_amd64_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts )
+{
+# define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
+# define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
+# define mkU64(_n) IRExpr_Const(IRConst_U64(_n))
+# define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
+
+ Int i, arity = 0;
+ for (i = 0; args[i]; i++)
+ arity++;
+# if 0
+ vex_printf("spec request:\n");
+ vex_printf(" %s ", function_name);
+ for (i = 0; i < arity; i++) {
+ vex_printf(" ");
+ ppIRExpr(args[i]);
+ }
+ vex_printf("\n");
+# endif
+
+ /* --------- specialising "amd64g_calculate_condition" --------- */
+
+ if (vex_streq(function_name, "amd64g_calculate_condition")) {
+ /* specialise calls to above "calculate condition" function */
+ IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
+ vassert(arity == 5);
+ cond = args[0];
+ cc_op = args[1];
+ cc_dep1 = args[2];
+ cc_dep2 = args[3];
+
+ /*---------------- ADDQ ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_ADDQ) && isU64(cond, AMD64CondZ)) {
+ /* long long add, then Z --> test (dst+src == 0) */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64,
+ binop(Iop_Add64, cc_dep1, cc_dep2),
+ mkU64(0)));
+ }
+
+ /*---------------- SUBQ ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondZ)) {
+ /* long long sub/cmp, then Z --> test dst==src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64,cc_dep1,cc_dep2));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNZ)) {
+ /* long long sub/cmp, then NZ --> test dst!=src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpNE64,cc_dep1,cc_dep2));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondL)) {
+ /* long long sub/cmp, then L (signed less than)
+ --> test dst <s src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64S, cc_dep1, cc_dep2));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondB)) {
+ /* long long sub/cmp, then B (unsigned less than)
+ --> test dst <u src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64U, cc_dep1, cc_dep2));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondNB)) {
+ /* long long sub/cmp, then NB (unsigned greater than or equal)
+ --> test src <=u dst */
+ /* Note, args are opposite way round from the usual */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLE64U, cc_dep2, cc_dep1));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBQ) && isU64(cond, AMD64CondBE)) {
+ /* long long sub/cmp, then BE (unsigned less than or equal)
+ --> test dst <=u src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLE64U, cc_dep1, cc_dep2));
+ }
+
+ /*---------------- SUBL ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondZ)) {
+ /* long sub/cmp, then Z --> test dst==src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ binop(Iop_Shl64,cc_dep2,mkU8(32))));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNZ)) {
+ /* long sub/cmp, then NZ --> test dst!=src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpNE64,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ binop(Iop_Shl64,cc_dep2,mkU8(32))));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondL)) {
+ /* long sub/cmp, then L (signed less than)
+ --> test dst <s src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64S,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ binop(Iop_Shl64,cc_dep2,mkU8(32))));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondLE)) {
+ /* long sub/cmp, then LE (signed less than or equal)
+ --> test dst <=s src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLE64S,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ binop(Iop_Shl64,cc_dep2,mkU8(32))));
+
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNLE)) {
+ /* long sub/cmp, then NLE (signed greater than)
+ --> test !(dst <=s src)
+ --> test (dst >s src)
+ --> test (src <s dst) */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64S,
+ binop(Iop_Shl64,cc_dep2,mkU8(32)),
+ binop(Iop_Shl64,cc_dep1,mkU8(32))));
+
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondBE)) {
+ /* long sub/cmp, then BE (unsigned less than or equal)
+ --> test dst <=u src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLE64U,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ binop(Iop_Shl64,cc_dep2,mkU8(32))));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondNBE)) {
+ /* long sub/cmp, then NBE (unsigned greater than)
+ --> test src <u dst */
+ /* Note, args are opposite way round from the usual */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64U,
+ binop(Iop_Shl64,cc_dep2,mkU8(32)),
+ binop(Iop_Shl64,cc_dep1,mkU8(32))));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBL) && isU64(cond, AMD64CondS)) {
+ /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64S,
+ binop(Iop_Sub64,
+ binop(Iop_Shl64, cc_dep1, mkU8(32)),
+ binop(Iop_Shl64, cc_dep2, mkU8(32))),
+ mkU64(0)));
+ }
+
+ /*---------------- SUBW ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondZ)) {
+ /* word sub/cmp, then Z --> test dst==src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ16,
+ unop(Iop_64to16,cc_dep1),
+ unop(Iop_64to16,cc_dep2)));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondNZ)) {
+ /* word sub/cmp, then NZ --> test dst!=src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpNE16,
+ unop(Iop_64to16,cc_dep1),
+ unop(Iop_64to16,cc_dep2)));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBW) && isU64(cond, AMD64CondLE)) {
+ /* word sub/cmp, then LE (signed less than or equal)
+ --> test dst <=s src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLE64S,
+ binop(Iop_Shl64,cc_dep1,mkU8(48)),
+ binop(Iop_Shl64,cc_dep2,mkU8(48))));
+
+ }
+
+ /*---------------- SUBB ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondZ)) {
+ /* byte sub/cmp, then Z --> test dst==src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ8,
+ unop(Iop_64to8,cc_dep1),
+ unop(Iop_64to8,cc_dep2)));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNZ)) {
+ /* byte sub/cmp, then NZ --> test dst!=src */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpNE8,
+ unop(Iop_64to8,cc_dep1),
+ unop(Iop_64to8,cc_dep2)));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondS)
+ && isU64(cc_dep2, 0)) {
+ /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
+ --> test dst <s 0
+ --> (ULong)dst[7]
+ This is yet another scheme by which gcc figures out if the
+ top bit of a byte is 1 or 0. See also LOGICB/CondS below. */
+ /* Note: isU64(cc_dep2, 0) is correct, even though this is
+ for an 8-bit comparison, since the args to the helper
+ function are always U64s. */
+ return binop(Iop_And64,
+ binop(Iop_Shr64,cc_dep1,mkU8(7)),
+ mkU64(1));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_SUBB) && isU64(cond, AMD64CondNS)
+ && isU64(cc_dep2, 0)) {
+ /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
+ --> test !(dst <s 0)
+ --> (ULong) !dst[7]
+ */
+ return binop(Iop_Xor64,
+ binop(Iop_And64,
+ binop(Iop_Shr64,cc_dep1,mkU8(7)),
+ mkU64(1)),
+ mkU64(1));
+ }
+
+ /*---------------- LOGICQ ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondZ)) {
+ /* long long and/or/xor, then Z --> test dst==0 */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64, cc_dep1, mkU64(0)));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_LOGICQ) && isU64(cond, AMD64CondL)) {
+ /* long long and/or/xor, then L
+ LOGIC sets SF and ZF according to the
+ result and makes OF be zero. L computes SF ^ OF, but
+ OF is zero, so this reduces to SF -- which will be 1 iff
+ the result is < signed 0. Hence ...
+ */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64S,
+ cc_dep1,
+ mkU64(0)));
+ }
+
+ /*---------------- LOGICL ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondZ)) {
+ /* long and/or/xor, then Z --> test dst==0 */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ mkU64(0)));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondNZ)) {
+ /* long and/or/xor, then NZ --> test dst!=0 */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpNE64,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ mkU64(0)));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_LOGICL) && isU64(cond, AMD64CondLE)) {
+ /* long and/or/xor, then LE
+ This is pretty subtle. LOGIC sets SF and ZF according to the
+ result and makes OF be zero. LE computes (SF ^ OF) | ZF, but
+ OF is zero, so this reduces to SF | ZF -- which will be 1 iff
+ the result is <=signed 0. Hence ...
+ */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLE64S,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ mkU64(0)));
+ }
+
+ /*---------------- LOGICB ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondZ)) {
+ /* byte and/or/xor, then Z --> test dst==0 */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64, binop(Iop_And64,cc_dep1,mkU64(255)),
+ mkU64(0)));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondNZ)) {
+ /* byte and/or/xor, then NZ --> test dst!=0 */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpNE64, binop(Iop_And64,cc_dep1,mkU64(255)),
+ mkU64(0)));
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_LOGICB) && isU64(cond, AMD64CondS)) {
+ /* this is an idiom gcc sometimes uses to find out if the top
+ bit of a byte register is set: eg testb %al,%al; js ..
+ Since it just depends on the top bit of the byte, extract
+ that bit and explicitly get rid of all the rest. This
+ helps memcheck avoid false positives in the case where any
+ of the other bits in the byte are undefined. */
+         /* byte and/or/xor, then S --> (ULong)result[7] */
+ return binop(Iop_And64,
+ binop(Iop_Shr64,cc_dep1,mkU8(7)),
+ mkU64(1));
+ }
+
+ /*---------------- INCB ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_INCB) && isU64(cond, AMD64CondLE)) {
+ /* 8-bit inc, then LE --> sign bit of the arg */
+ return binop(Iop_And64,
+ binop(Iop_Shr64,
+ binop(Iop_Sub64, cc_dep1, mkU64(1)),
+ mkU8(7)),
+ mkU64(1));
+ }
+
+ /*---------------- INCW ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_INCW) && isU64(cond, AMD64CondZ)) {
+ /* 16-bit inc, then Z --> test dst == 0 */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64,
+ binop(Iop_Shl64,cc_dep1,mkU8(48)),
+ mkU64(0)));
+ }
+
+ /*---------------- DECL ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_DECL) && isU64(cond, AMD64CondZ)) {
+ /* dec L, then Z --> test dst == 0 */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ mkU64(0)));
+ }
+
+ /*---------------- DECW ----------------*/
+
+ if (isU64(cc_op, AMD64G_CC_OP_DECW) && isU64(cond, AMD64CondNZ)) {
+ /* 16-bit dec, then NZ --> test dst != 0 */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpNE64,
+ binop(Iop_Shl64,cc_dep1,mkU8(48)),
+ mkU64(0)));
+ }
+
+ /*---------------- COPY ----------------*/
+ /* This can happen, as a result of amd64 FP compares: "comisd ... ;
+ jbe" for example. */
+
+ if (isU64(cc_op, AMD64G_CC_OP_COPY) &&
+ (isU64(cond, AMD64CondBE) || isU64(cond, AMD64CondNBE))) {
+ /* COPY, then BE --> extract C and Z from dep1, and test (C
+ or Z == 1). */
+ /* COPY, then NBE --> extract C and Z from dep1, and test (C
+ or Z == 0). */
+ ULong nnn = isU64(cond, AMD64CondBE) ? 1 : 0;
+ return
+ unop(
+ Iop_1Uto64,
+ binop(
+ Iop_CmpEQ64,
+ binop(
+ Iop_And64,
+ binop(
+ Iop_Or64,
+ binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
+ binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z))
+ ),
+ mkU64(1)
+ ),
+ mkU64(nnn)
+ )
+ );
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondB)) {
+ /* COPY, then B --> extract C dep1, and test (C == 1). */
+ return
+ unop(
+ Iop_1Uto64,
+ binop(
+ Iop_CmpNE64,
+ binop(
+ Iop_And64,
+ binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_C)),
+ mkU64(1)
+ ),
+ mkU64(0)
+ )
+ );
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_COPY)
+ && (isU64(cond, AMD64CondZ) || isU64(cond, AMD64CondNZ))) {
+ /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
+ /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
+      ULong nnn = isU64(cond, AMD64CondZ) ? 1 : 0;
+ return
+ unop(
+ Iop_1Uto64,
+ binop(
+ Iop_CmpEQ64,
+ binop(
+ Iop_And64,
+ binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_Z)),
+ mkU64(1)
+ ),
+ mkU64(nnn)
+ )
+ );
+ }
+
+ if (isU64(cc_op, AMD64G_CC_OP_COPY) && isU64(cond, AMD64CondP)) {
+ /* COPY, then P --> extract P from dep1, and test (P == 1). */
+ return
+ unop(
+ Iop_1Uto64,
+ binop(
+ Iop_CmpNE64,
+ binop(
+ Iop_And64,
+ binop(Iop_Shr64, cc_dep1, mkU8(AMD64G_CC_SHIFT_P)),
+ mkU64(1)
+ ),
+ mkU64(0)
+ )
+ );
+ }
+
+ return NULL;
+ }
+
+ /* --------- specialising "amd64g_calculate_rflags_c" --------- */
+
+ if (vex_streq(function_name, "amd64g_calculate_rflags_c")) {
+ /* specialise calls to above "calculate_rflags_c" function */
+ IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
+ vassert(arity == 4);
+ cc_op = args[0];
+ cc_dep1 = args[1];
+ cc_dep2 = args[2];
+ cc_ndep = args[3];
+
+ if (isU64(cc_op, AMD64G_CC_OP_SUBQ)) {
+ /* C after sub denotes unsigned less than */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64U,
+ cc_dep1,
+ cc_dep2));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_SUBL)) {
+ /* C after sub denotes unsigned less than */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64U,
+ binop(Iop_Shl64,cc_dep1,mkU8(32)),
+ binop(Iop_Shl64,cc_dep2,mkU8(32))));
+ }
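+      /* (Shifting both args left by 32 throws away the irrelevant
+         upper bits while preserving the unsigned ordering of the low
+         32 bits; the SUBB case below uses masking to the same end.) */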
+ if (isU64(cc_op, AMD64G_CC_OP_SUBB)) {
+ /* C after sub denotes unsigned less than */
+ return unop(Iop_1Uto64,
+ binop(Iop_CmpLT64U,
+ binop(Iop_And64,cc_dep1,mkU64(0xFF)),
+ binop(Iop_And64,cc_dep2,mkU64(0xFF))));
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_LOGICQ)
+ || isU64(cc_op, AMD64G_CC_OP_LOGICL)
+ || isU64(cc_op, AMD64G_CC_OP_LOGICW)
+ || isU64(cc_op, AMD64G_CC_OP_LOGICB)) {
+ /* cflag after logic is zero */
+ return mkU64(0);
+ }
+ if (isU64(cc_op, AMD64G_CC_OP_DECL) || isU64(cc_op, AMD64G_CC_OP_INCL)
+ || isU64(cc_op, AMD64G_CC_OP_DECQ) || isU64(cc_op, AMD64G_CC_OP_INCQ)) {
+         /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP,
+            since inc/dec themselves leave the carry flag unchanged. */
+ return cc_ndep;
+ }
+
+# if 0
+ if (cc_op->tag == Iex_Const) {
+ vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
+ }
+# endif
+
+ return NULL;
+ }
+
+# undef unop
+# undef binop
+# undef mkU64
+# undef mkU8
+
+ return NULL;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Supporting functions for x87 FPU activities. ---*/
+/*---------------------------------------------------------------*/
+
+static inline Bool host_is_little_endian ( void )
+{
+ UInt x = 0x76543210;
+ UChar* p = (UChar*)(&x);
+ return toBool(*p == 0x10);
+}
+
+/* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+ULong amd64g_calculate_FXAM ( ULong tag, ULong dbl )
+{
+ Bool mantissaIsZero;
+ Int bexp;
+ UChar sign;
+ UChar* f64;
+
+ vassert(host_is_little_endian());
+
+ /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
+
+ f64 = (UChar*)(&dbl);
+ sign = toUChar( (f64[7] >> 7) & 1 );
+
+ /* First off, if the tag indicates the register was empty,
+ return 1,0,sign,1 */
+ if (tag == 0) {
+ /* vex_printf("Empty\n"); */
+ return AMD64G_FC_MASK_C3 | 0 | (sign << AMD64G_FC_SHIFT_C1)
+ | AMD64G_FC_MASK_C0;
+ }
+
+ bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
+ bexp &= 0x7FF;
+
+ mantissaIsZero
+ = toBool(
+ (f64[6] & 0x0F) == 0
+ && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
+ );
+
+ /* If both exponent and mantissa are zero, the value is zero.
+ Return 1,0,sign,0. */
+ if (bexp == 0 && mantissaIsZero) {
+ /* vex_printf("Zero\n"); */
+ return AMD64G_FC_MASK_C3 | 0
+ | (sign << AMD64G_FC_SHIFT_C1) | 0;
+ }
+
+ /* If exponent is zero but mantissa isn't, it's a denormal.
+ Return 1,1,sign,0. */
+ if (bexp == 0 && !mantissaIsZero) {
+ /* vex_printf("Denormal\n"); */
+ return AMD64G_FC_MASK_C3 | AMD64G_FC_MASK_C2
+ | (sign << AMD64G_FC_SHIFT_C1) | 0;
+ }
+
+ /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
+ Return 0,1,sign,1. */
+ if (bexp == 0x7FF && mantissaIsZero) {
+ /* vex_printf("Inf\n"); */
+ return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1)
+ | AMD64G_FC_MASK_C0;
+ }
+
+ /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
+ Return 0,0,sign,1. */
+ if (bexp == 0x7FF && !mantissaIsZero) {
+ /* vex_printf("NaN\n"); */
+ return 0 | 0 | (sign << AMD64G_FC_SHIFT_C1) | AMD64G_FC_MASK_C0;
+ }
+
+ /* Uh, ok, we give up. It must be a normal finite number.
+ Return 0,1,sign,0.
+ */
+ /* vex_printf("normal\n"); */
+ return 0 | AMD64G_FC_MASK_C2 | (sign << AMD64G_FC_SHIFT_C1) | 0;
+}
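+
+/* Summary of the classification above, as (C3,C2,C1,C0):
+     Empty     1 0 sign 1
+     Zero      1 0 sign 0
+     Denormal  1 1 sign 0
+     Inf       0 1 sign 1
+     NaN       0 0 sign 1
+     Normal    0 1 sign 0  */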
+
+
+/* Create an x87 FPU state from the guest state, as close as
+ we can approximate it. */
+static
+void do_get_x87 ( /*IN*/VexGuestAMD64State* vex_state,
+ /*OUT*/UChar* x87_state )
+{
+ Int i, stno, preg;
+ UInt tagw;
+ ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
+ UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
+ Fpu_State* x87 = (Fpu_State*)x87_state;
+ UInt ftop = vex_state->guest_FTOP;
+ UInt c3210 = vex_state->guest_FC3210;
+
+ for (i = 0; i < 14; i++)
+ x87->env[i] = 0;
+
+ x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
+ x87->env[FP_ENV_STAT]
+ = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
+ x87->env[FP_ENV_CTRL]
+ = toUShort(amd64g_create_fpucw( vex_state->guest_FPROUND ));
+
+ /* Dump the register stack in ST order. */
+ tagw = 0;
+ for (stno = 0; stno < 8; stno++) {
+ preg = (stno + ftop) & 7;
+ if (vexTags[preg] == 0) {
+ /* register is empty */
+ tagw |= (3 << (2*preg));
+ convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
+ &x87->reg[10*stno] );
+ } else {
+ /* register is full. */
+ tagw |= (0 << (2*preg));
+ convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
+ &x87->reg[10*stno] );
+ }
+ }
+ x87->env[FP_ENV_TAG] = toUShort(tagw);
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest state, writes guest mem) */
+/* NOTE: only handles 32-bit format (no REX.W on the insn) */
+void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State* gst, HWord addr )
+{
+ /* Derived from values obtained from
+ vendor_id : AuthenticAMD
+ cpu family : 15
+ model : 12
+ model name : AMD Athlon(tm) 64 Processor 3200+
+ stepping : 0
+ cpu MHz : 2200.000
+ cache size : 512 KB
+ */
+ /* Somewhat roundabout, but at least it's simple. */
+ Fpu_State tmp;
+ UShort* addrS = (UShort*)addr;
+ UChar* addrC = (UChar*)addr;
+ U128* xmm = (U128*)(addr + 160);
+ UInt mxcsr;
+ UShort fp_tags;
+ UInt summary_tags;
+ Int r, stno;
+ UShort *srcS, *dstS;
+
+ do_get_x87( gst, (UChar*)&tmp );
+ mxcsr = amd64g_create_mxcsr( gst->guest_SSEROUND );
+
+ /* Now build the proper fxsave image from the x87 image we just
+ made. */
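+
+   /* Byte offsets within the image being built:
+        0  FCW    2  FSW    4  FTW(abridged) + pad    6  FOP
+        8  RIP   16  RDP   24  MXCSR   28  MXCSR_MASK
+       32  ST0..ST7, 16 bytes each (10 used, 6 zeroed)
+      160  XMM0..XMM15, 16 bytes each */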
+
+ addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
+   addrS[1] = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
+
+ /* set addrS[2] in an endian-independent way */
+ summary_tags = 0;
+ fp_tags = tmp.env[FP_ENV_TAG];
+ for (r = 0; r < 8; r++) {
+ if ( ((fp_tags >> (2*r)) & 3) != 3 )
+ summary_tags |= (1 << r);
+ }
+ addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */
+ addrC[5] = 0; /* pad */
+
+ /* FOP: faulting fpu opcode. From experimentation, the real CPU
+ does not write this field. (?!) */
+ addrS[3] = 0; /* BOGUS */
+
+ /* RIP (Last x87 instruction pointer). From experimentation, the
+ real CPU does not write this field. (?!) */
+ addrS[4] = 0; /* BOGUS */
+ addrS[5] = 0; /* BOGUS */
+ addrS[6] = 0; /* BOGUS */
+ addrS[7] = 0; /* BOGUS */
+
+ /* RDP (Last x87 data pointer). From experimentation, the real CPU
+ does not write this field. (?!) */
+ addrS[8] = 0; /* BOGUS */
+ addrS[9] = 0; /* BOGUS */
+ addrS[10] = 0; /* BOGUS */
+ addrS[11] = 0; /* BOGUS */
+
+ addrS[12] = toUShort(mxcsr); /* MXCSR */
+ addrS[13] = toUShort(mxcsr >> 16);
+
+ addrS[14] = 0xFFFF; /* MXCSR mask (lo16) */
+ addrS[15] = 0x0000; /* MXCSR mask (hi16) */
+
+ /* Copy in the FP registers, in ST order. */
+ for (stno = 0; stno < 8; stno++) {
+ srcS = (UShort*)(&tmp.reg[10*stno]);
+ dstS = (UShort*)(&addrS[16 + 8*stno]);
+ dstS[0] = srcS[0];
+ dstS[1] = srcS[1];
+ dstS[2] = srcS[2];
+ dstS[3] = srcS[3];
+ dstS[4] = srcS[4];
+ dstS[5] = 0;
+ dstS[6] = 0;
+ dstS[7] = 0;
+ }
+
+ /* That's the first 160 bytes of the image done. Now only %xmm0
+ .. %xmm15 remain to be copied. If the host is big-endian, these
+ need to be byte-swapped. */
+ vassert(host_is_little_endian());
+
+# define COPY_U128(_dst,_src) \
+ do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
+ _dst[2] = _src[2]; _dst[3] = _src[3]; } \
+ while (0)
+
+ COPY_U128( xmm[0], gst->guest_XMM0 );
+ COPY_U128( xmm[1], gst->guest_XMM1 );
+ COPY_U128( xmm[2], gst->guest_XMM2 );
+ COPY_U128( xmm[3], gst->guest_XMM3 );
+ COPY_U128( xmm[4], gst->guest_XMM4 );
+ COPY_U128( xmm[5], gst->guest_XMM5 );
+ COPY_U128( xmm[6], gst->guest_XMM6 );
+ COPY_U128( xmm[7], gst->guest_XMM7 );
+ COPY_U128( xmm[8], gst->guest_XMM8 );
+ COPY_U128( xmm[9], gst->guest_XMM9 );
+ COPY_U128( xmm[10], gst->guest_XMM10 );
+ COPY_U128( xmm[11], gst->guest_XMM11 );
+ COPY_U128( xmm[12], gst->guest_XMM12 );
+ COPY_U128( xmm[13], gst->guest_XMM13 );
+ COPY_U128( xmm[14], gst->guest_XMM14 );
+ COPY_U128( xmm[15], gst->guest_XMM15 );
+
+# undef COPY_U128
+}
+
+
+/* DIRTY HELPER (writes guest state) */
+/* Initialise the x87 FPU state as per 'finit'. */
+void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* gst )
+{
+ Int i;
+ gst->guest_FTOP = 0;
+ for (i = 0; i < 8; i++) {
+ gst->guest_FPTAG[i] = 0; /* empty */
+ gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
+ }
+ gst->guest_FPROUND = (ULong)Irrm_NEAREST;
+ gst->guest_FC3210 = 0;
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest memory) */
+ULong amd64g_dirtyhelper_loadF80le ( ULong addrU )
+{
+ ULong f64;
+ convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
+ return f64;
+}
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest memory) */
+void amd64g_dirtyhelper_storeF80le ( ULong addrU, ULong f64 )
+{
+ convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* CLEAN HELPER */
+/* mxcsr[15:0] contains an SSE native format MXCSR value.
+ Extract from it the required SSEROUND value and any resulting
+ emulation warning, and return (warn << 32) | sseround value.
+*/
+ULong amd64g_check_ldmxcsr ( ULong mxcsr )
+{
+ /* Decide on a rounding mode. mxcsr[14:13] holds it. */
+ /* NOTE, encoded exactly as per enum IRRoundingMode. */
+ ULong rmode = (mxcsr >> 13) & 3;
+
+ /* Detect any required emulation warnings. */
+ VexEmWarn ew = EmWarn_NONE;
+
+ if ((mxcsr & 0x1F80) != 0x1F80) {
+ /* unmasked exceptions! */
+ ew = EmWarn_X86_sseExns;
+ }
+ else
+ if (mxcsr & (1<<15)) {
+ /* FZ is set */
+ ew = EmWarn_X86_fz;
+ }
+ else
+ if (mxcsr & (1<<6)) {
+ /* DAZ is set */
+ ew = EmWarn_X86_daz;
+ }
+
+ return (((ULong)ew) << 32) | ((ULong)rmode);
+}
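+
+/* For example, the power-on MXCSR value 0x1F80 has all exceptions
+   masked and FZ, DAZ and RC all zero, so amd64g_check_ldmxcsr(0x1F80)
+   returns 0: EmWarn_NONE in the top half, round-nearest in the
+   bottom. */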
+
+
+/* CALLED FROM GENERATED CODE */
+/* CLEAN HELPER */
+/* Given sseround as an IRRoundingMode value, create a suitable SSE
+ native format MXCSR value. */
+ULong amd64g_create_mxcsr ( ULong sseround )
+{
+ sseround &= 3;
+ return 0x1F80 | (sseround << 13);
+}
+
+
+/* CLEAN HELPER */
+/* fpucw[15:0] contains an x87 native format FPU control word.
+ Extract from it the required FPROUND value and any resulting
+ emulation warning, and return (warn << 32) | fpround value.
+*/
+ULong amd64g_check_fldcw ( ULong fpucw )
+{
+ /* Decide on a rounding mode. fpucw[11:10] holds it. */
+ /* NOTE, encoded exactly as per enum IRRoundingMode. */
+ ULong rmode = (fpucw >> 10) & 3;
+
+ /* Detect any required emulation warnings. */
+ VexEmWarn ew = EmWarn_NONE;
+
+ if ((fpucw & 0x3F) != 0x3F) {
+ /* unmasked exceptions! */
+ ew = EmWarn_X86_x87exns;
+ }
+ else
+ if (((fpucw >> 8) & 3) != 3) {
+ /* unsupported precision */
+ ew = EmWarn_X86_x87precision;
+ }
+
+ return (((ULong)ew) << 32) | ((ULong)rmode);
+}
+
+
+/* CLEAN HELPER */
+/* Given fpround as an IRRoundingMode value, create a suitable x87
+ native format FPU control word. */
+ULong amd64g_create_fpucw ( ULong fpround )
+{
+ fpround &= 3;
+ return 0x037F | (fpround << 10);
+}
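+
+/* A small sketch (kept uncompiled) of how the two control-word
+   helpers above invert each other: for the power-on control word
+   0x037F, amd64g_check_fldcw extracts rounding mode 0 (nearest) and
+   no warning, and amd64g_create_fpucw maps that back to 0x037F. */
+# if 0
+static void example_fpucw_roundtrip ( void )
+{
+   ULong pair  = amd64g_check_fldcw(0x037F);
+   ULong rmode = pair & 0xFFFFFFFFULL;   /* 0 == Irrm_NEAREST */
+   ULong warn  = pair >> 32;             /* 0 == EmWarn_NONE */
+   vassert(warn == 0 && rmode == 0);
+   vassert(amd64g_create_fpucw(rmode) == 0x037F);
+}
+# endif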
+
+
+/* This is used to implement 'fldenv'.
+ Reads 28 bytes at x87_state[0 .. 27]. */
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER */
+VexEmWarn amd64g_dirtyhelper_FLDENV ( /*OUT*/VexGuestAMD64State* vex_state,
+ /*IN*/HWord x87_state)
+{
+ Int stno, preg;
+ UInt tag;
+ UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
+ Fpu_State* x87 = (Fpu_State*)x87_state;
+ UInt ftop = (x87->env[FP_ENV_STAT] >> 11) & 7;
+ UInt tagw = x87->env[FP_ENV_TAG];
+ UInt fpucw = x87->env[FP_ENV_CTRL];
+ ULong c3210 = x87->env[FP_ENV_STAT] & 0x4700;
+ VexEmWarn ew;
+ ULong fpround;
+ ULong pair;
+
+ /* Copy tags */
+ for (stno = 0; stno < 8; stno++) {
+ preg = (stno + ftop) & 7;
+ tag = (tagw >> (2*preg)) & 3;
+ if (tag == 3) {
+ /* register is empty */
+ vexTags[preg] = 0;
+ } else {
+ /* register is non-empty */
+ vexTags[preg] = 1;
+ }
+ }
+
+ /* stack pointer */
+ vex_state->guest_FTOP = ftop;
+
+ /* status word */
+ vex_state->guest_FC3210 = c3210;
+
+ /* handle the control word, setting FPROUND and detecting any
+ emulation warnings. */
+ pair = amd64g_check_fldcw ( (ULong)fpucw );
+ fpround = pair & 0xFFFFFFFFULL;
+ ew = (VexEmWarn)(pair >> 32);
+
+ vex_state->guest_FPROUND = fpround & 3;
+
+ /* emulation warnings --> caller */
+ return ew;
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER */
+/* Create an x87 FPU env from the guest state, as close as we can
+ approximate it. Writes 28 bytes at x87_state[0..27]. */
+void amd64g_dirtyhelper_FSTENV ( /*IN*/VexGuestAMD64State* vex_state,
+ /*OUT*/HWord x87_state )
+{
+ Int i, stno, preg;
+ UInt tagw;
+ UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
+ Fpu_State* x87 = (Fpu_State*)x87_state;
+ UInt ftop = vex_state->guest_FTOP;
+ ULong c3210 = vex_state->guest_FC3210;
+
+ for (i = 0; i < 14; i++)
+ x87->env[i] = 0;
+
+ x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
+ x87->env[FP_ENV_STAT]
+ = toUShort(toUInt( ((ftop & 7) << 11) | (c3210 & 0x4700) ));
+ x87->env[FP_ENV_CTRL]
+ = toUShort(toUInt( amd64g_create_fpucw( vex_state->guest_FPROUND ) ));
+
+ /* Compute the x87 tag word. */
+ tagw = 0;
+ for (stno = 0; stno < 8; stno++) {
+ preg = (stno + ftop) & 7;
+ if (vexTags[preg] == 0) {
+ /* register is empty */
+ tagw |= (3 << (2*preg));
+ } else {
+ /* register is full. */
+ tagw |= (0 << (2*preg));
+ }
+ }
+ x87->env[FP_ENV_TAG] = toUShort(tagw);
+
+   /* We don't dump the x87 registers, though. */
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Misc integer helpers, including rotates and CPUID. ---*/
+/*---------------------------------------------------------------*/
+
+/* Claim to be the following CPU, which is probably representative of
+ the lowliest (earliest) amd64 offerings. It can do neither sse3
+ nor cx16.
+
+ vendor_id : AuthenticAMD
+ cpu family : 15
+ model : 5
+ model name : AMD Opteron (tm) Processor 848
+ stepping : 10
+ cpu MHz : 1797.682
+ cache size : 1024 KB
+ fpu : yes
+ fpu_exception : yes
+ cpuid level : 1
+ wp : yes
+ flags : fpu vme de pse tsc msr pae mce cx8 apic sep
+ mtrr pge mca cmov pat pse36 clflush mmx fxsr
+ sse sse2 syscall nx mmxext lm 3dnowext 3dnow
+ bogomips : 3600.62
+ TLB size : 1088 4K pages
+ clflush size : 64
+ cache_alignment : 64
+ address sizes : 40 bits physical, 48 bits virtual
+ power management: ts fid vid ttp
+*/
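+/* In leaf 0 below, EBX:EDX:ECX hold the ASCII vendor string
+   "AuthenticAMD" (0x68747541 = "Auth", 0x69746e65 = "enti",
+   0x444d4163 = "cAMD", little-endian within each register). */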
+void amd64g_dirtyhelper_CPUID_baseline ( VexGuestAMD64State* st )
+{
+# define SET_ABCD(_a,_b,_c,_d) \
+ do { st->guest_RAX = (ULong)(_a); \
+ st->guest_RBX = (ULong)(_b); \
+ st->guest_RCX = (ULong)(_c); \
+ st->guest_RDX = (ULong)(_d); \
+ } while (0)
+
+ switch (0xFFFFFFFF & st->guest_RAX) {
+ case 0x00000000:
+ SET_ABCD(0x00000001, 0x68747541, 0x444d4163, 0x69746e65);
+ break;
+ case 0x00000001:
+ SET_ABCD(0x00000f5a, 0x01000800, 0x00000000, 0x078bfbff);
+ break;
+ case 0x80000000:
+ SET_ABCD(0x80000018, 0x68747541, 0x444d4163, 0x69746e65);
+ break;
+ case 0x80000001:
+ SET_ABCD(0x00000f5a, 0x00000505, 0x00000000, 0xe1d3fbff);
+ break;
+ case 0x80000002:
+ SET_ABCD(0x20444d41, 0x6574704f, 0x206e6f72, 0x296d7428);
+ break;
+ case 0x80000003:
+ SET_ABCD(0x6f725020, 0x73736563, 0x3820726f, 0x00003834);
+ break;
+ case 0x80000004:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000005:
+ SET_ABCD(0xff08ff08, 0xff20ff20, 0x40020140, 0x40020140);
+ break;
+ case 0x80000006:
+ SET_ABCD(0x00000000, 0x42004200, 0x04008140, 0x00000000);
+ break;
+ case 0x80000007:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x0000000f);
+ break;
+ case 0x80000008:
+ SET_ABCD(0x00003028, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ default:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ }
+# undef SET_ABCD
+}
+
+
+/* Claim to be the following CPU (2 x ...), which is sse3 and cx16
+ capable.
+
+ vendor_id : GenuineIntel
+ cpu family : 6
+ model : 15
+ model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
+ stepping : 6
+ cpu MHz : 2394.000
+ cache size : 4096 KB
+ physical id : 0
+ siblings : 2
+ core id : 0
+ cpu cores : 2
+ fpu : yes
+ fpu_exception : yes
+ cpuid level : 10
+ wp : yes
+ flags : fpu vme de pse tsc msr pae mce cx8 apic sep
+ mtrr pge mca cmov pat pse36 clflush dts acpi
+ mmx fxsr sse sse2 ss ht tm syscall nx lm
+ constant_tsc pni monitor ds_cpl vmx est tm2
+ cx16 xtpr lahf_lm
+ bogomips : 4798.78
+ clflush size : 64
+ cache_alignment : 64
+ address sizes : 36 bits physical, 48 bits virtual
+ power management:
+*/
+void amd64g_dirtyhelper_CPUID_sse3_and_cx16 ( VexGuestAMD64State* st )
+{
+# define SET_ABCD(_a,_b,_c,_d) \
+ do { st->guest_RAX = (ULong)(_a); \
+ st->guest_RBX = (ULong)(_b); \
+ st->guest_RCX = (ULong)(_c); \
+ st->guest_RDX = (ULong)(_d); \
+ } while (0)
+
+ switch (0xFFFFFFFF & st->guest_RAX) {
+ case 0x00000000:
+ SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
+ break;
+ case 0x00000001:
+ SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
+ break;
+ case 0x00000002:
+ SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
+ break;
+ case 0x00000003:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000004: {
+ switch (0xFFFFFFFF & st->guest_RCX) {
+ case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
+ 0x0000003f, 0x00000001); break;
+ case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
+ 0x0000003f, 0x00000001); break;
+ case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
+ 0x00000fff, 0x00000001); break;
+ default: SET_ABCD(0x00000000, 0x00000000,
+ 0x00000000, 0x00000000); break;
+ }
+ break;
+ }
+ case 0x00000005:
+ SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
+ break;
+ case 0x00000006:
+ SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
+ break;
+ case 0x00000007:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000008:
+ SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000009:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x0000000a:
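+      /* (also the landing point for any leaf we don't know about) */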
+ unhandled_eax_value:
+ SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000000:
+ SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000001:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100800);
+ break;
+ case 0x80000002:
+ SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
+ break;
+ case 0x80000003:
+ SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
+ break;
+ case 0x80000004:
+ SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
+ break;
+ case 0x80000005:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000006:
+ SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
+ break;
+ case 0x80000007:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000008:
+ SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ default:
+ goto unhandled_eax_value;
+ }
+# undef SET_ABCD
+}
+
+
+/* Claim to be the following CPU (4 x ...), which is sse4.2 and cx16
+ capable.
+
+ vendor_id : GenuineIntel
+ cpu family : 6
+ model : 37
+ model name : Intel(R) Core(TM) i5 CPU 670 @ 3.47GHz
+ stepping : 2
+ cpu MHz : 3334.000
+ cache size : 4096 KB
+ physical id : 0
+ siblings : 4
+ core id : 0
+ cpu cores : 2
+ apicid : 0
+ initial apicid : 0
+ fpu : yes
+ fpu_exception : yes
+ cpuid level : 11
+ wp : yes
+ flags : fpu vme de pse tsc msr pae mce cx8 apic sep
+ mtrr pge mca cmov pat pse36 clflush dts acpi
+ mmx fxsr sse sse2 ss ht tm pbe syscall nx rdtscp
+ lm constant_tsc arch_perfmon pebs bts rep_good
+ xtopology nonstop_tsc aperfmperf pni pclmulqdq
+ dtes64 monitor ds_cpl vmx smx est tm2 ssse3 cx16
+ xtpr pdcm sse4_1 sse4_2 popcnt aes lahf_lm ida
+ arat tpr_shadow vnmi flexpriority ept vpid
+ MINUS aes (see below)
+ bogomips : 6957.57
+ clflush size : 64
+ cache_alignment : 64
+ address sizes : 36 bits physical, 48 bits virtual
+ power management:
+*/
+void amd64g_dirtyhelper_CPUID_sse42_and_cx16 ( VexGuestAMD64State* st )
+{
+# define SET_ABCD(_a,_b,_c,_d) \
+ do { st->guest_RAX = (ULong)(_a); \
+ st->guest_RBX = (ULong)(_b); \
+ st->guest_RCX = (ULong)(_c); \
+ st->guest_RDX = (ULong)(_d); \
+ } while (0)
+
+ UInt old_eax = (UInt)st->guest_RAX;
+ UInt old_ecx = (UInt)st->guest_RCX;
+
+ switch (old_eax) {
+ case 0x00000000:
+ SET_ABCD(0x0000000b, 0x756e6547, 0x6c65746e, 0x49656e69);
+ break;
+ case 0x00000001:
+ // & ~(1<<25): don't claim to support AES insns. See
+ // bug 249991.
+ SET_ABCD(0x00020652, 0x00100800, 0x0298e3ff & ~(1<<25),
+ 0xbfebfbff);
+ break;
+ case 0x00000002:
+ SET_ABCD(0x55035a01, 0x00f0b2e3, 0x00000000, 0x09ca212c);
+ break;
+ case 0x00000003:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000004:
+ switch (old_ecx) {
+ case 0x00000000: SET_ABCD(0x1c004121, 0x01c0003f,
+ 0x0000003f, 0x00000000); break;
+ case 0x00000001: SET_ABCD(0x1c004122, 0x00c0003f,
+ 0x0000007f, 0x00000000); break;
+ case 0x00000002: SET_ABCD(0x1c004143, 0x01c0003f,
+ 0x000001ff, 0x00000000); break;
+ case 0x00000003: SET_ABCD(0x1c03c163, 0x03c0003f,
+ 0x00000fff, 0x00000002); break;
+ default: SET_ABCD(0x00000000, 0x00000000,
+ 0x00000000, 0x00000000); break;
+ }
+ break;
+ case 0x00000005:
+ SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00001120);
+ break;
+ case 0x00000006:
+ SET_ABCD(0x00000007, 0x00000002, 0x00000001, 0x00000000);
+ break;
+ case 0x00000007:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000008:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000009:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x0000000a:
+ SET_ABCD(0x07300403, 0x00000004, 0x00000000, 0x00000603);
+ break;
+ case 0x0000000b:
+ switch (old_ecx) {
+ case 0x00000000:
+ SET_ABCD(0x00000001, 0x00000002,
+ 0x00000100, 0x00000000); break;
+ case 0x00000001:
+ SET_ABCD(0x00000004, 0x00000004,
+ 0x00000201, 0x00000000); break;
+ default:
+ SET_ABCD(0x00000000, 0x00000000,
+ old_ecx, 0x00000000); break;
+ }
+ break;
+ case 0x0000000c:
+ SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
+ break;
+ case 0x0000000d:
+ switch (old_ecx) {
+ case 0x00000000: SET_ABCD(0x00000001, 0x00000002,
+ 0x00000100, 0x00000000); break;
+ case 0x00000001: SET_ABCD(0x00000004, 0x00000004,
+ 0x00000201, 0x00000000); break;
+ default: SET_ABCD(0x00000000, 0x00000000,
+ old_ecx, 0x00000000); break;
+ }
+ break;
+ case 0x80000000:
+ SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000001:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x28100800);
+ break;
+ case 0x80000002:
+ SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
+ break;
+ case 0x80000003:
+ SET_ABCD(0x35692029, 0x55504320, 0x20202020, 0x20202020);
+ break;
+ case 0x80000004:
+ SET_ABCD(0x30373620, 0x20402020, 0x37342e33, 0x007a4847);
+ break;
+ case 0x80000005:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000006:
+ SET_ABCD(0x00000000, 0x00000000, 0x01006040, 0x00000000);
+ break;
+ case 0x80000007:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000100);
+ break;
+ case 0x80000008:
+ SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ default:
+ SET_ABCD(0x00000001, 0x00000002, 0x00000100, 0x00000000);
+ break;
+ }
+# undef SET_ABCD
+}
+
+
+ULong amd64g_calculate_RCR ( ULong arg,
+ ULong rot_amt,
+ ULong rflags_in,
+ Long szIN )
+{
+ Bool wantRflags = toBool(szIN < 0);
+ ULong sz = wantRflags ? (-szIN) : szIN;
+ ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
+ ULong cf=0, of=0, tempcf;
+
+ switch (sz) {
+ case 8:
+ cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
+ of = ((arg >> 63) ^ cf) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = arg & 1;
+ arg = (arg >> 1) | (cf << 63);
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ break;
+ case 4:
+ while (tempCOUNT >= 33) tempCOUNT -= 33;
+ cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
+ of = ((arg >> 31) ^ cf) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = arg & 1;
+ arg = ((arg >> 1) & 0x7FFFFFFFULL) | (cf << 31);
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ break;
+ case 2:
+ while (tempCOUNT >= 17) tempCOUNT -= 17;
+ cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
+ of = ((arg >> 15) ^ cf) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = arg & 1;
+ arg = ((arg >> 1) & 0x7FFFULL) | (cf << 15);
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ break;
+ case 1:
+ while (tempCOUNT >= 9) tempCOUNT -= 9;
+ cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
+ of = ((arg >> 7) ^ cf) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = arg & 1;
+ arg = ((arg >> 1) & 0x7FULL) | (cf << 7);
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ break;
+ default:
+ vpanic("calculate_RCR(amd64g): invalid size");
+ }
+
+ cf &= 1;
+ of &= 1;
+ rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
+ rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
+
+ /* caller can ask to have back either the resulting flags or
+ resulting value, but not both */
+ return wantRflags ? rflags_in : arg;
+}
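+
+/* Worked example: an 8-bit RCR by 1 of 0x01 with carry clear rotates
+   the old CF into bit 7 and bit 0 into CF, so
+   amd64g_calculate_RCR(0x01, 1, 0, 1) returns 0x00, while the same
+   call with szIN == -1 returns rflags with C set and O clear. */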
+
+ULong amd64g_calculate_RCL ( ULong arg,
+ ULong rot_amt,
+ ULong rflags_in,
+ Long szIN )
+{
+ Bool wantRflags = toBool(szIN < 0);
+ ULong sz = wantRflags ? (-szIN) : szIN;
+ ULong tempCOUNT = rot_amt & (sz == 8 ? 0x3F : 0x1F);
+ ULong cf=0, of=0, tempcf;
+
+ switch (sz) {
+ case 8:
+ cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = (arg >> 63) & 1;
+ arg = (arg << 1) | (cf & 1);
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ of = ((arg >> 63) ^ cf) & 1;
+ break;
+ case 4:
+ while (tempCOUNT >= 33) tempCOUNT -= 33;
+ cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = (arg >> 31) & 1;
+ arg = 0xFFFFFFFFULL & ((arg << 1) | (cf & 1));
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ of = ((arg >> 31) ^ cf) & 1;
+ break;
+ case 2:
+ while (tempCOUNT >= 17) tempCOUNT -= 17;
+ cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = (arg >> 15) & 1;
+ arg = 0xFFFFULL & ((arg << 1) | (cf & 1));
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ of = ((arg >> 15) ^ cf) & 1;
+ break;
+ case 1:
+ while (tempCOUNT >= 9) tempCOUNT -= 9;
+ cf = (rflags_in >> AMD64G_CC_SHIFT_C) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = (arg >> 7) & 1;
+ arg = 0xFFULL & ((arg << 1) | (cf & 1));
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ of = ((arg >> 7) ^ cf) & 1;
+ break;
+ default:
+ vpanic("calculate_RCL(amd64g): invalid size");
+ }
+
+ cf &= 1;
+ of &= 1;
+ rflags_in &= ~(AMD64G_CC_MASK_C | AMD64G_CC_MASK_O);
+ rflags_in |= (cf << AMD64G_CC_SHIFT_C) | (of << AMD64G_CC_SHIFT_O);
+
+ return wantRflags ? rflags_in : arg;
+}
+
+/* Taken from gf2x-0.9.5, released under GPLv2+ (later versions LGPLv2+)
+ * svn://scm.gforge.inria.fr/svn/gf2x/trunk/hardware/opteron/gf2x_mul1.h@25
+ */
+ULong amd64g_calculate_pclmul(ULong a, ULong b, ULong which)
+{
+ ULong hi, lo, tmp, A[16];
+
+ A[0] = 0; A[1] = a;
+ A[2] = A[1] << 1; A[3] = A[2] ^ a;
+ A[4] = A[2] << 1; A[5] = A[4] ^ a;
+ A[6] = A[3] << 1; A[7] = A[6] ^ a;
+ A[8] = A[4] << 1; A[9] = A[8] ^ a;
+ A[10] = A[5] << 1; A[11] = A[10] ^ a;
+ A[12] = A[6] << 1; A[13] = A[12] ^ a;
+ A[14] = A[7] << 1; A[15] = A[14] ^ a;
+
+ lo = (A[b >> 60] << 4) ^ A[(b >> 56) & 15];
+ hi = lo >> 56;
+ lo = (lo << 8) ^ (A[(b >> 52) & 15] << 4) ^ A[(b >> 48) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 44) & 15] << 4) ^ A[(b >> 40) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 36) & 15] << 4) ^ A[(b >> 32) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 28) & 15] << 4) ^ A[(b >> 24) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 20) & 15] << 4) ^ A[(b >> 16) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 12) & 15] << 4) ^ A[(b >> 8) & 15];
+ hi = (hi << 8) | (lo >> 56);
+ lo = (lo << 8) ^ (A[(b >> 4) & 15] << 4) ^ A[b & 15];
+
+ ULong m0 = -1;
+ m0 /= 255;
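+   /* m0 == 0x0101010101010101, so (m0 * c) replicates the byte c
+      into all eight byte lanes of the masks below */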
+ tmp = -((a >> 63) & 1); tmp &= ((b & (m0 * 0xfe)) >> 1); hi = hi ^ tmp;
+ tmp = -((a >> 62) & 1); tmp &= ((b & (m0 * 0xfc)) >> 2); hi = hi ^ tmp;
+ tmp = -((a >> 61) & 1); tmp &= ((b & (m0 * 0xf8)) >> 3); hi = hi ^ tmp;
+ tmp = -((a >> 60) & 1); tmp &= ((b & (m0 * 0xf0)) >> 4); hi = hi ^ tmp;
+ tmp = -((a >> 59) & 1); tmp &= ((b & (m0 * 0xe0)) >> 5); hi = hi ^ tmp;
+ tmp = -((a >> 58) & 1); tmp &= ((b & (m0 * 0xc0)) >> 6); hi = hi ^ tmp;
+ tmp = -((a >> 57) & 1); tmp &= ((b & (m0 * 0x80)) >> 7); hi = hi ^ tmp;
+
+ return which ? hi : lo;
+}
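+
+/* This is a carry-less (GF(2)[x]) multiply: partial products are
+   combined with XOR rather than addition.  For instance
+   amd64g_calculate_pclmul(3, 5, 0) == 15, since
+   (x+1) * (x^2+1) == x^3+x^2+x+1. */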
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially-transparent) */
+/* Horrible hack. On non-amd64 platforms, return 1. */
+ULong amd64g_dirtyhelper_RDTSC ( void )
+{
+# if defined(__x86_64__)
+ UInt eax, edx;
+ __asm__ __volatile__("rdtsc" : "=a" (eax), "=d" (edx));
+ return (((ULong)edx) << 32) | ((ULong)eax);
+# else
+ return 1ULL;
+# endif
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially-transparent) */
+/* Horrible hack. On non-amd64 platforms, return 0. */
+ULong amd64g_dirtyhelper_IN ( ULong portno, ULong sz/*1,2 or 4*/ )
+{
+# if defined(__x86_64__)
+ ULong r = 0;
+ portno &= 0xFFFF;
+ switch (sz) {
+ case 4:
+ __asm__ __volatile__("movq $0,%%rax; inl %w1,%%eax; movq %%rax,%0"
+ : "=a" (r) : "Nd" (portno));
+ break;
+ case 2:
+ __asm__ __volatile__("movq $0,%%rax; inw %w1,%w0"
+ : "=a" (r) : "Nd" (portno));
+ break;
+ case 1:
+ __asm__ __volatile__("movq $0,%%rax; inb %w1,%b0"
+ : "=a" (r) : "Nd" (portno));
+ break;
+ default:
+ break; /* note: no 64-bit version of insn exists */
+ }
+ return r;
+# else
+ return 0;
+# endif
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially-transparent) */
+/* Horrible hack. On non-amd64 platforms, do nothing. */
+void amd64g_dirtyhelper_OUT ( ULong portno, ULong data, ULong sz/*1,2 or 4*/ )
+{
+# if defined(__x86_64__)
+ portno &= 0xFFFF;
+ switch (sz) {
+ case 4:
+ __asm__ __volatile__("movq %0,%%rax; outl %%eax, %w1"
+ : : "a" (data), "Nd" (portno));
+ break;
+ case 2:
+ __asm__ __volatile__("outw %w0, %w1"
+ : : "a" (data), "Nd" (portno));
+ break;
+ case 1:
+ __asm__ __volatile__("outb %b0, %w1"
+ : : "a" (data), "Nd" (portno));
+ break;
+ default:
+ break; /* note: no 64-bit version of insn exists */
+ }
+# else
+ /* do nothing */
+# endif
+}
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially-transparent) */
+/* Horrible hack. On non-amd64 platforms, zero the result area. */
+/* op = 0: call the native SGDT instruction.
+ op = 1: call the native SIDT instruction.
+*/
+void amd64g_dirtyhelper_SxDT ( void *address, ULong op ) {
+# if defined(__x86_64__)
+ switch (op) {
+ case 0:
+ __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
+ break;
+ case 1:
+ __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
+ break;
+ default:
+ vpanic("amd64g_dirtyhelper_SxDT");
+ }
+# else
+   /* not amd64: just zero out the 10-byte result area */
+ UChar* p = (UChar*)address;
+ p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
+ p[6] = p[7] = p[8] = p[9] = 0;
+# endif
+}
+
+/*---------------------------------------------------------------*/
+/*--- Helpers for MMX/SSE/SSE2. ---*/
+/*---------------------------------------------------------------*/
+
+static inline UChar abdU8 ( UChar xx, UChar yy ) {
+ return toUChar(xx>yy ? xx-yy : yy-xx);
+}
+
+static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
+ return (((ULong)w1) << 32) | ((ULong)w0);
+}
+
+static inline UShort sel16x4_3 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUShort(hi32 >> 16);
+}
+static inline UShort sel16x4_2 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUShort(hi32);
+}
+static inline UShort sel16x4_1 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUShort(lo32 >> 16);
+}
+static inline UShort sel16x4_0 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUShort(lo32);
+}
+
+static inline UChar sel8x8_7 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(hi32 >> 24);
+}
+static inline UChar sel8x8_6 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(hi32 >> 16);
+}
+static inline UChar sel8x8_5 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(hi32 >> 8);
+}
+static inline UChar sel8x8_4 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(hi32 >> 0);
+}
+static inline UChar sel8x8_3 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUChar(lo32 >> 24);
+}
+static inline UChar sel8x8_2 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUChar(lo32 >> 16);
+}
+static inline UChar sel8x8_1 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUChar(lo32 >> 8);
+}
+static inline UChar sel8x8_0 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUChar(lo32 >> 0);
+}
+
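+/* pmaddwd: multiply corresponding signed 16-bit lanes of xx and yy
+   and add adjacent products into two signed 32-bit lanes.  E.g. with
+   xx == yy == 0x0001000200030004, the result is 0x0000000500000019:
+   1*1 + 2*2 == 5 in the high lane, 3*3 + 4*4 == 25 in the low. */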
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+ULong amd64g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
+{
+ return
+ mk32x2(
+ (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
+ + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
+ (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
+ + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
+ );
+}
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+ULong amd64g_calculate_mmx_pmovmskb ( ULong xx )
+{
+ ULong r = 0;
+ if (xx & (1ULL << (64-1))) r |= (1<<7);
+ if (xx & (1ULL << (56-1))) r |= (1<<6);
+ if (xx & (1ULL << (48-1))) r |= (1<<5);
+ if (xx & (1ULL << (40-1))) r |= (1<<4);
+ if (xx & (1ULL << (32-1))) r |= (1<<3);
+ if (xx & (1ULL << (24-1))) r |= (1<<2);
+ if (xx & (1ULL << (16-1))) r |= (1<<1);
+ if (xx & (1ULL << ( 8-1))) r |= (1<<0);
+ return r;
+}
+
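+/* psadbw: sum of the absolute differences of the eight byte lanes,
+   delivered in the low 16 bits of the result.  E.g. with xx == 0 and
+   yy == 0x0101010101010101, every lane differs by 1, giving 8. */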
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+ULong amd64g_calculate_mmx_psadbw ( ULong xx, ULong yy )
+{
+ UInt t = 0;
+ t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
+ t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
+ t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
+ t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
+ t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
+ t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
+ t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
+ t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
+ t &= 0xFFFF;
+ return (ULong)t;
+}
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
+{
+ ULong rHi8 = amd64g_calculate_mmx_pmovmskb ( w64hi );
+ ULong rLo8 = amd64g_calculate_mmx_pmovmskb ( w64lo );
+ return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF);
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Helpers for SSE4.2 PCMP{E,I}STR{I,M} ---*/
+/*---------------------------------------------------------------*/
+
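+/* Build a 16-bit mask with bit i set iff byte i of the vector is
+   zero: an all-zero V128 gives 0xFFFF, one with no zero bytes gives
+   0. */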
+static UInt zmask_from_V128 ( V128* arg )
+{
+ UInt i, res = 0;
+ for (i = 0; i < 16; i++) {
+ res |= ((arg->w8[i] == 0) ? 1 : 0) << i;
+ }
+ return res;
+}
+
+/* Helps with PCMP{I,E}STR{I,M}.
+
+   CALLED FROM GENERATED CODE: DIRTY HELPER(s).  (Not really -- it
+   could be a clean helper, except that we can't pass two V128s by
+   value to a clean helper, nor have one returned.)
+   Reads guest state, writes to guest state for the xSTRM cases,
+   makes no memory accesses, and is otherwise a pure function.
+
+   opc4_and_imm contains (4th byte of opcode << 8) | the-imm8-byte so
+ the callee knows which I/E and I/M variant it is dealing with and
+ what the specific operation is. 4th byte of opcode is in the range
+ 0x60 to 0x63:
+ istri 66 0F 3A 63
+ istrm 66 0F 3A 62
+ estri 66 0F 3A 61
+ estrm 66 0F 3A 60
+
+ gstOffL and gstOffR are the guest state offsets for the two XMM
+ register inputs. We never have to deal with the memory case since
+ that is handled by pre-loading the relevant value into the fake
+ XMM16 register.
+
+ For ESTRx variants, edxIN and eaxIN hold the values of those two
+ registers.
+
+ In all cases, the bottom 16 bits of the result contain the new
+ OSZACP %rflags values. For xSTRI variants, bits[31:16] of the
+ result hold the new %ecx value. For xSTRM variants, the helper
+ writes the result directly to the guest XMM0.
+
+ Declarable side effects: in all cases, reads guest state at
+ [gstOffL, +16) and [gstOffR, +16). For xSTRM variants, also writes
+ guest_XMM0.
+
+   Is expected to be called with opc4_and_imm combinations which have
+   actually been validated, and will assert otherwise.  The front
+   end should ensure we're only called with verified values.
+*/
+ULong amd64g_dirtyhelper_PCMPxSTRx (
+ VexGuestAMD64State* gst,
+ HWord opc4_and_imm,
+ HWord gstOffL, HWord gstOffR,
+ HWord edxIN, HWord eaxIN
+ )
+{
+ HWord opc4 = (opc4_and_imm >> 8) & 0xFF;
+ HWord imm8 = opc4_and_imm & 0xFF;
+ HWord isISTRx = opc4 & 2;
+ HWord isxSTRM = (opc4 & 1) ^ 1;
+ vassert((opc4 & 0xFC) == 0x60); /* 0x60 .. 0x63 */
+ vassert((imm8 & 1) == 0); /* we support byte-size cases only */
+
+ // where the args are
+ V128* argL = (V128*)( ((UChar*)gst) + gstOffL );
+ V128* argR = (V128*)( ((UChar*)gst) + gstOffR );
+
+ /* Create the arg validity masks, either from the vectors
+ themselves or from the supplied edx/eax values. */
+ // FIXME: this is only right for the 8-bit data cases.
+ // At least that is asserted above.
+ UInt zmaskL, zmaskR;
+ if (isISTRx) {
+ zmaskL = zmask_from_V128(argL);
+ zmaskR = zmask_from_V128(argR);
+ } else {
+ Int tmp;
+ tmp = edxIN & 0xFFFFFFFF;
+ if (tmp < -16) tmp = -16;
+ if (tmp > 16) tmp = 16;
+ if (tmp < 0) tmp = -tmp;
+ vassert(tmp >= 0 && tmp <= 16);
+ zmaskL = (1 << tmp) & 0xFFFF;
+ tmp = eaxIN & 0xFFFFFFFF;
+ if (tmp < -16) tmp = -16;
+ if (tmp > 16) tmp = 16;
+ if (tmp < 0) tmp = -tmp;
+ vassert(tmp >= 0 && tmp <= 16);
+ zmaskR = (1 << tmp) & 0xFFFF;
+ }
+
+ // temp spot for the resulting flags and vector.
+ V128 resV;
+ UInt resOSZACP;
+
+   // do the math
+ Bool ok = compute_PCMPxSTRx (
+ &resV, &resOSZACP, argL, argR,
+ zmaskL, zmaskR, imm8, (Bool)isxSTRM
+ );
+
+ // front end shouldn't pass us any imm8 variants we can't
+ // handle. Hence:
+ vassert(ok);
+
+ // So, finally we need to get the results back to the caller.
+ // In all cases, the new OSZACP value is the lowest 16 of
+ // the return value.
+ if (isxSTRM) {
+      /* gst->guest_XMM0 = resV; */ // gcc doesn't like that
+ gst->guest_XMM0[0] = resV.w32[0];
+ gst->guest_XMM0[1] = resV.w32[1];
+ gst->guest_XMM0[2] = resV.w32[2];
+ gst->guest_XMM0[3] = resV.w32[3];
+ return resOSZACP & 0x8D5;
+ } else {
+ UInt newECX = resV.w32[0] & 0xFFFF;
+ return (newECX << 16) | (resOSZACP & 0x8D5);
+ }
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Helpers for dealing with, and describing, ---*/
+/*--- guest state as a whole. ---*/
+/*---------------------------------------------------------------*/
+
+/* Initialise the entire amd64 guest state. */
+/* VISIBLE TO LIBVEX CLIENT */
+void LibVEX_GuestAMD64_initialise ( /*OUT*/VexGuestAMD64State* vex_state )
+{
+ vex_state->guest_RAX = 0;
+ vex_state->guest_RCX = 0;
+ vex_state->guest_RDX = 0;
+ vex_state->guest_RBX = 0;
+ vex_state->guest_RSP = 0;
+ vex_state->guest_RBP = 0;
+ vex_state->guest_RSI = 0;
+ vex_state->guest_RDI = 0;
+ vex_state->guest_R8 = 0;
+ vex_state->guest_R9 = 0;
+ vex_state->guest_R10 = 0;
+ vex_state->guest_R11 = 0;
+ vex_state->guest_R12 = 0;
+ vex_state->guest_R13 = 0;
+ vex_state->guest_R14 = 0;
+ vex_state->guest_R15 = 0;
+
+ vex_state->guest_CC_OP = AMD64G_CC_OP_COPY;
+ vex_state->guest_CC_DEP1 = 0;
+ vex_state->guest_CC_DEP2 = 0;
+ vex_state->guest_CC_NDEP = 0;
+
+ vex_state->guest_DFLAG = 1; /* forwards */
+ vex_state->guest_IDFLAG = 0;
+
+ /* HACK: represent the offset associated with %fs==0. This
+ assumes that %fs is only ever zero. */
+ vex_state->guest_FS_ZERO = 0;
+
+ vex_state->guest_RIP = 0;
+
+ /* Initialise the simulated FPU */
+ amd64g_dirtyhelper_FINIT( vex_state );
+
+ /* Initialise the SSE state. */
+# define SSEZERO(_xmm) _xmm[0]=_xmm[1]=_xmm[2]=_xmm[3] = 0;
+
+ vex_state->guest_SSEROUND = (ULong)Irrm_NEAREST;
+ SSEZERO(vex_state->guest_XMM0);
+ SSEZERO(vex_state->guest_XMM1);
+ SSEZERO(vex_state->guest_XMM2);
+ SSEZERO(vex_state->guest_XMM3);
+ SSEZERO(vex_state->guest_XMM4);
+ SSEZERO(vex_state->guest_XMM5);
+ SSEZERO(vex_state->guest_XMM6);
+ SSEZERO(vex_state->guest_XMM7);
+ SSEZERO(vex_state->guest_XMM8);
+ SSEZERO(vex_state->guest_XMM9);
+ SSEZERO(vex_state->guest_XMM10);
+ SSEZERO(vex_state->guest_XMM11);
+ SSEZERO(vex_state->guest_XMM12);
+ SSEZERO(vex_state->guest_XMM13);
+ SSEZERO(vex_state->guest_XMM14);
+ SSEZERO(vex_state->guest_XMM15);
+ SSEZERO(vex_state->guest_XMM16);
+
+# undef SSEZERO
+
+ vex_state->guest_EMWARN = EmWarn_NONE;
+
+ /* These should not ever be either read or written, but we
+ initialise them anyway. */
+ vex_state->guest_TISTART = 0;
+ vex_state->guest_TILEN = 0;
+
+ vex_state->guest_NRADDR = 0;
+ vex_state->guest_SC_CLASS = 0;
+ vex_state->guest_GS_0x60 = 0;
+
+ vex_state->guest_IP_AT_SYSCALL = 0;
+ /* vex_state->padding = 0; */
+}
+
+
+/* Figure out if any part of the guest state contained in minoff
+ .. maxoff requires precise memory exceptions. If in doubt return
+   True (but this generates significantly slower code).
+
+ By default we enforce precise exns for guest %RSP, %RBP and %RIP
+ only. These are the minimum needed to extract correct stack
+ backtraces from amd64 code.
+*/
+Bool guest_amd64_state_requires_precise_mem_exns ( Int minoff,
+ Int maxoff)
+{
+ Int rbp_min = offsetof(VexGuestAMD64State, guest_RBP);
+ Int rbp_max = rbp_min + 8 - 1;
+ Int rsp_min = offsetof(VexGuestAMD64State, guest_RSP);
+ Int rsp_max = rsp_min + 8 - 1;
+ Int rip_min = offsetof(VexGuestAMD64State, guest_RIP);
+ Int rip_max = rip_min + 8 - 1;
+
+ if (maxoff < rbp_min || minoff > rbp_max) {
+ /* no overlap with rbp */
+ } else {
+ return True;
+ }
+
+ if (maxoff < rsp_min || minoff > rsp_max) {
+ /* no overlap with rsp */
+ } else {
+ return True;
+ }
+
+ if (maxoff < rip_min || minoff > rip_max) {
+      /* no overlap with rip */
+ } else {
+ return True;
+ }
+
+ return False;
+}
+
+
+#define ALWAYSDEFD(field) \
+ { offsetof(VexGuestAMD64State, field), \
+ (sizeof ((VexGuestAMD64State*)0)->field) }
+
+VexGuestLayout
+ amd64guest_layout
+ = {
+ /* Total size of the guest state, in bytes. */
+ .total_sizeB = sizeof(VexGuestAMD64State),
+
+ /* Describe the stack pointer. */
+ .offset_SP = offsetof(VexGuestAMD64State,guest_RSP),
+ .sizeof_SP = 8,
+
+ /* Describe the frame pointer. */
+ .offset_FP = offsetof(VexGuestAMD64State,guest_RBP),
+ .sizeof_FP = 8,
+
+ /* Describe the instruction pointer. */
+ .offset_IP = offsetof(VexGuestAMD64State,guest_RIP),
+ .sizeof_IP = 8,
+
+ /* Describe any sections to be regarded by Memcheck as
+ 'always-defined'. */
+ .n_alwaysDefd = 16,
+
+ /* flags thunk: OP and NDEP are always defd, whereas DEP1
+ and DEP2 have to be tracked. See detailed comment in
+ gdefs.h on meaning of thunk fields. */
+ .alwaysDefd
+ = { /* 0 */ ALWAYSDEFD(guest_CC_OP),
+ /* 1 */ ALWAYSDEFD(guest_CC_NDEP),
+ /* 2 */ ALWAYSDEFD(guest_DFLAG),
+ /* 3 */ ALWAYSDEFD(guest_IDFLAG),
+ /* 4 */ ALWAYSDEFD(guest_RIP),
+ /* 5 */ ALWAYSDEFD(guest_FS_ZERO),
+ /* 6 */ ALWAYSDEFD(guest_FTOP),
+ /* 7 */ ALWAYSDEFD(guest_FPTAG),
+ /* 8 */ ALWAYSDEFD(guest_FPROUND),
+ /* 9 */ ALWAYSDEFD(guest_FC3210),
+ // /* */ ALWAYSDEFD(guest_CS),
+ // /* */ ALWAYSDEFD(guest_DS),
+ // /* */ ALWAYSDEFD(guest_ES),
+ // /* */ ALWAYSDEFD(guest_FS),
+ // /* */ ALWAYSDEFD(guest_GS),
+ // /* */ ALWAYSDEFD(guest_SS),
+ // /* */ ALWAYSDEFD(guest_LDT),
+ // /* */ ALWAYSDEFD(guest_GDT),
+ /* 10 */ ALWAYSDEFD(guest_EMWARN),
+ /* 11 */ ALWAYSDEFD(guest_SSEROUND),
+ /* 12 */ ALWAYSDEFD(guest_TISTART),
+ /* 13 */ ALWAYSDEFD(guest_TILEN),
+ /* 14 */ ALWAYSDEFD(guest_SC_CLASS),
+ /* 15 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
+ }
+ };
+
+
+/*---------------------------------------------------------------*/
+/*--- end guest_amd64_helpers.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_amd64_toIR.c b/VEX/priv/guest_amd64_toIR.c
new file mode 100644
index 0000000..79b1269
--- /dev/null
+++ b/VEX/priv/guest_amd64_toIR.c
@@ -0,0 +1,18294 @@
+
+/*--------------------------------------------------------------------*/
+/*--- begin guest_amd64_toIR.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* Translates AMD64 code to IR. */
+
+/* TODO:
+
+ All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
+ to ensure a 64-bit value is being written.
+
+ x87 FP Limitations:
+
+ * all arithmetic done at 64 bits
+
+ * no FP exceptions, except for handling stack over/underflow
+
+ * FP rounding mode observed only for float->int conversions and
+ int->float conversions which could lose accuracy, and for
+ float-to-float rounding. For all other operations,
+ round-to-nearest is used, regardless.
+
+ * FP sin/cos/tan/sincos: C2 flag is always cleared. IOW the
+ simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
+ even when it isn't.
+
+ * some of the FCOM cases could do with testing -- not convinced
+ that the args are the right way round.
+
+ * FSAVE does not re-initialise the FPU; it should do
+
+ * FINIT not only initialises the FPU environment, it also zeroes
+ all the FP registers. It should leave the registers unchanged.
+
+ RDTSC returns zero, always.
+
+ SAHF should cause eflags[1] == 1, and in fact it produces 0. As
+ per Intel docs this bit has no meaning anyway. Since PUSHF is the
+ only way to observe eflags[1], a proper fix would be to make that
+ bit be set by PUSHF.
+
+ This module uses global variables and so is not MT-safe (if that
+ should ever become relevant).
+*/
+
+/* Notes re address size overrides (0x67).
+
+ According to the AMD documentation (24594 Rev 3.09, Sept 2003,
+ "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
+ and System Instructions"), Section 1.2.3 ("Address-Size Override
+ Prefix"):
+
+ 0x67 applies to all explicit memory references, causing the top
+ 32 bits of the effective address to become zero.
+
+ 0x67 has no effect on stack references (push/pop); these always
+ use a 64-bit address.
+
+ 0x67 changes the interpretation of instructions which implicitly
+ reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
+ instead. These are:
+
+ cmp{s,sb,sw,sd,sq}
+ in{s,sb,sw,sd}
+ jcxz, jecxz, jrcxz
+ lod{s,sb,sw,sd,sq}
+ loop{,e,bz,be,z}
+ mov{s,sb,sw,sd,sq}
+ out{s,sb,sw,sd}
+ rep{,e,ne,nz}
+ sca{s,sb,sw,sd,sq}
+ sto{s,sb,sw,sd,sq}
+ xlat{,b} */
+
+/* "Special" instructions.
+
+ This instruction decoder can decode three special instructions
+ which mean nothing natively (are no-ops as far as regs/mem are
+ concerned) but have meaning for supporting Valgrind. A special
+ instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
+ 48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
+ $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
+   Following that, one of the following 3 is allowed (standard
+ interpretation in parentheses):
+
+ 4887DB (xchgq %rbx,%rbx) %RDX = client_request ( %RAX )
+ 4887C9 (xchgq %rcx,%rcx) %RAX = guest_NRADDR
+ 4887D2 (xchgq %rdx,%rdx) call-noredir *%RAX
+
+ Any other bytes following the 16-byte preamble are illegal and
+ constitute a failure in instruction decoding. This all assumes
+ that the preamble will never occur except in specific code
+ fragments designed for Valgrind to catch.
+
+ No prefixes may precede a "Special" instruction.
+*/
+
+/* casLE (implementation of lock-prefixed insns) and rep-prefixed
+ insns: the side-exit back to the start of the insn is done with
+ Ijk_Boring. This is quite wrong, it should be done with
+ Ijk_NoRedir, since otherwise the side exit, which is intended to
+ restart the instruction for whatever reason, could go somewhere
+ entirely else. Doing it right (with Ijk_NoRedir jumps) would make
+ no-redir jumps performance critical, at least for rep-prefixed
+ instructions, since all iterations thereof would involve such a
+ jump. It's not such a big deal with casLE since the side exit is
+ only taken if the CAS fails, that is, the location is contended,
+ which is relatively unlikely.
+
+ Note also, the test for CAS success vs failure is done using
+ Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
+ Iop_Cmp{EQ,NE} equivalents. This is so as to tell Memcheck that it
+ shouldn't definedness-check these comparisons. See
+ COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
+ background/rationale.
+*/
+
+/* LOCK prefixed instructions. These are translated using IR-level
+ CAS statements (IRCAS) and are believed to preserve atomicity, even
+ from the point of view of some other process racing against a
+ simulated one (presumably they communicate via a shared memory
+ segment).
+
+ Handlers which are aware of LOCK prefixes are:
+ dis_op2_G_E (add, or, adc, sbb, and, sub, xor)
+ dis_cmpxchg_G_E (cmpxchg)
+ dis_Grp1 (add, or, adc, sbb, and, sub, xor)
+ dis_Grp3 (not, neg)
+ dis_Grp4 (inc, dec)
+ dis_Grp5 (inc, dec)
+ dis_Grp8_Imm (bts, btc, btr)
+ dis_bt_G_E (bts, btc, btr)
+ dis_xadd_G_E (xadd)
+*/
+
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+#include "libvex_guest_amd64.h"
+
+#include "main_util.h"
+#include "main_globals.h"
+#include "guest_generic_bb_to_IR.h"
+#include "guest_generic_x87.h"
+#include "guest_amd64_defs.h"
+
+
+/*------------------------------------------------------------*/
+/*--- Globals ---*/
+/*------------------------------------------------------------*/
+
+/* These are set at the start of the translation of a BB, or of an
+   individual insn (right down in disInstr_AMD64), so that we don't
+   have to pass them around endlessly.  They are all constant during
+   the translation of any given insn. */
+
+/* We need to know this to do sub-register accesses correctly. */
+static Bool host_is_bigendian;
+
+/* Pointer to the guest code area (points to start of BB, not to the
+ insn being processed). */
+static UChar* guest_code;
+
+/* The guest address corresponding to guest_code[0]. */
+static Addr64 guest_RIP_bbstart;
+
+/* The guest address for the instruction currently being
+ translated. */
+static Addr64 guest_RIP_curr_instr;
+
+/* The IRSB* into which we're generating code. */
+static IRSB* irsb;
+
+/* For ensuring that %rip-relative addressing is done right. A read
+ of %rip generates the address of the next instruction. It may be
+ that we don't conveniently know that inside disAMode(). For sanity
+ checking, if the next insn %rip is needed, we make a guess at what
+ it is, record that guess here, and set the accompanying Bool to
+ indicate that -- after this insn's decode is finished -- that guess
+ needs to be checked. */
+
+/* At the start of each insn decode, is set to (0, False).
+ After the decode, if _mustcheck is now True, _assumed is
+ checked. */
+
+static Addr64 guest_RIP_next_assumed;
+static Bool guest_RIP_next_mustcheck;
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for constructing IR. ---*/
+/*------------------------------------------------------------*/
+
+/* Generate a new temporary of the given type. */
+static IRTemp newTemp ( IRType ty )
+{
+ vassert(isPlausibleIRType(ty));
+ return newIRTemp( irsb->tyenv, ty );
+}
+
+/* Add a statement to the list held by "irsb". */
+static void stmt ( IRStmt* st )
+{
+ addStmtToIRSB( irsb, st );
+}
+
+/* Generate a statement "dst := e". */
+static void assign ( IRTemp dst, IRExpr* e )
+{
+ stmt( IRStmt_WrTmp(dst, e) );
+}
+
+static IRExpr* unop ( IROp op, IRExpr* a )
+{
+ return IRExpr_Unop(op, a);
+}
+
+static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
+{
+ return IRExpr_Binop(op, a1, a2);
+}
+
+static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
+{
+ return IRExpr_Triop(op, a1, a2, a3);
+}
+
+static IRExpr* mkexpr ( IRTemp tmp )
+{
+ return IRExpr_RdTmp(tmp);
+}
+
+static IRExpr* mkU8 ( ULong i )
+{
+ vassert(i < 256);
+ return IRExpr_Const(IRConst_U8( (UChar)i ));
+}
+
+static IRExpr* mkU16 ( ULong i )
+{
+ vassert(i < 0x10000ULL);
+ return IRExpr_Const(IRConst_U16( (UShort)i ));
+}
+
+static IRExpr* mkU32 ( ULong i )
+{
+ vassert(i < 0x100000000ULL);
+ return IRExpr_Const(IRConst_U32( (UInt)i ));
+}
+
+static IRExpr* mkU64 ( ULong i )
+{
+ return IRExpr_Const(IRConst_U64(i));
+}
+
+static IRExpr* mkU ( IRType ty, ULong i )
+{
+ switch (ty) {
+ case Ity_I8: return mkU8(i);
+ case Ity_I16: return mkU16(i);
+ case Ity_I32: return mkU32(i);
+ case Ity_I64: return mkU64(i);
+ default: vpanic("mkU(amd64)");
+ }
+}
+
+static void storeLE ( IRExpr* addr, IRExpr* data )
+{
+ stmt( IRStmt_Store(Iend_LE, addr, data) );
+}
+
+static IRExpr* loadLE ( IRType ty, IRExpr* addr )
+{
+ return IRExpr_Load(Iend_LE, ty, addr);
+}
+
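+/* Produce the 'ty'-sized variant of an 8-bit IROp.  This relies on
+   the IROp enumeration laying out the 8/16/32/64-bit variants of
+   each such operation consecutively, e.g. Iop_Add16 == Iop_Add8 + 1. */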
+static IROp mkSizedOp ( IRType ty, IROp op8 )
+{
+ vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
+ || op8 == Iop_Mul8
+ || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
+ || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
+ || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
+ || op8 == Iop_CasCmpNE8
+ || op8 == Iop_Not8 );
+ switch (ty) {
+ case Ity_I8: return 0 +op8;
+ case Ity_I16: return 1 +op8;
+ case Ity_I32: return 2 +op8;
+ case Ity_I64: return 3 +op8;
+ default: vpanic("mkSizedOp(amd64)");
+ }
+}
+
+static
+IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
+{
+ if (szSmall == 1 && szBig == 4) {
+ return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
+ }
+ if (szSmall == 1 && szBig == 2) {
+ return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
+ }
+ if (szSmall == 2 && szBig == 4) {
+ return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
+ }
+ if (szSmall == 1 && szBig == 8 && !signd) {
+ return unop(Iop_8Uto64, src);
+ }
+ if (szSmall == 1 && szBig == 8 && signd) {
+ return unop(Iop_8Sto64, src);
+ }
+ if (szSmall == 2 && szBig == 8 && !signd) {
+ return unop(Iop_16Uto64, src);
+ }
+ if (szSmall == 2 && szBig == 8 && signd) {
+ return unop(Iop_16Sto64, src);
+ }
+ vpanic("doScalarWidening(amd64)");
+}
+
+
+
+/*------------------------------------------------------------*/
+/*--- Debugging output ---*/
+/*------------------------------------------------------------*/
+
+/* Bomb out if we can't handle something. */
+__attribute__ ((noreturn))
+static void unimplemented ( HChar* str )
+{
+ vex_printf("amd64toIR: unimplemented feature\n");
+ vpanic(str);
+}
+
+#define DIP(format, args...) \
+ if (vex_traceflags & VEX_TRACE_FE) \
+ vex_printf(format, ## args)
+
+#define DIS(buf, format, args...) \
+ if (vex_traceflags & VEX_TRACE_FE) \
+ vex_sprintf(buf, format, ## args)
+
+
+/*------------------------------------------------------------*/
+/*--- Offsets of various parts of the amd64 guest state. ---*/
+/*------------------------------------------------------------*/
+
+#define OFFB_RAX offsetof(VexGuestAMD64State,guest_RAX)
+#define OFFB_RBX offsetof(VexGuestAMD64State,guest_RBX)
+#define OFFB_RCX offsetof(VexGuestAMD64State,guest_RCX)
+#define OFFB_RDX offsetof(VexGuestAMD64State,guest_RDX)
+#define OFFB_RSP offsetof(VexGuestAMD64State,guest_RSP)
+#define OFFB_RBP offsetof(VexGuestAMD64State,guest_RBP)
+#define OFFB_RSI offsetof(VexGuestAMD64State,guest_RSI)
+#define OFFB_RDI offsetof(VexGuestAMD64State,guest_RDI)
+#define OFFB_R8 offsetof(VexGuestAMD64State,guest_R8)
+#define OFFB_R9 offsetof(VexGuestAMD64State,guest_R9)
+#define OFFB_R10 offsetof(VexGuestAMD64State,guest_R10)
+#define OFFB_R11 offsetof(VexGuestAMD64State,guest_R11)
+#define OFFB_R12 offsetof(VexGuestAMD64State,guest_R12)
+#define OFFB_R13 offsetof(VexGuestAMD64State,guest_R13)
+#define OFFB_R14 offsetof(VexGuestAMD64State,guest_R14)
+#define OFFB_R15 offsetof(VexGuestAMD64State,guest_R15)
+
+#define OFFB_RIP offsetof(VexGuestAMD64State,guest_RIP)
+
+#define OFFB_FS_ZERO offsetof(VexGuestAMD64State,guest_FS_ZERO)
+#define OFFB_GS_0x60 offsetof(VexGuestAMD64State,guest_GS_0x60)
+
+#define OFFB_CC_OP offsetof(VexGuestAMD64State,guest_CC_OP)
+#define OFFB_CC_DEP1 offsetof(VexGuestAMD64State,guest_CC_DEP1)
+#define OFFB_CC_DEP2 offsetof(VexGuestAMD64State,guest_CC_DEP2)
+#define OFFB_CC_NDEP offsetof(VexGuestAMD64State,guest_CC_NDEP)
+
+#define OFFB_FPREGS offsetof(VexGuestAMD64State,guest_FPREG[0])
+#define OFFB_FPTAGS offsetof(VexGuestAMD64State,guest_FPTAG[0])
+#define OFFB_DFLAG offsetof(VexGuestAMD64State,guest_DFLAG)
+#define OFFB_ACFLAG offsetof(VexGuestAMD64State,guest_ACFLAG)
+#define OFFB_IDFLAG offsetof(VexGuestAMD64State,guest_IDFLAG)
+#define OFFB_FTOP offsetof(VexGuestAMD64State,guest_FTOP)
+#define OFFB_FC3210 offsetof(VexGuestAMD64State,guest_FC3210)
+#define OFFB_FPROUND offsetof(VexGuestAMD64State,guest_FPROUND)
+//..
+//.. #define OFFB_CS offsetof(VexGuestX86State,guest_CS)
+//.. #define OFFB_DS offsetof(VexGuestX86State,guest_DS)
+//.. #define OFFB_ES offsetof(VexGuestX86State,guest_ES)
+//.. #define OFFB_FS offsetof(VexGuestX86State,guest_FS)
+//.. #define OFFB_GS offsetof(VexGuestX86State,guest_GS)
+//.. #define OFFB_SS offsetof(VexGuestX86State,guest_SS)
+//.. #define OFFB_LDT offsetof(VexGuestX86State,guest_LDT)
+//.. #define OFFB_GDT offsetof(VexGuestX86State,guest_GDT)
+
+#define OFFB_SSEROUND offsetof(VexGuestAMD64State,guest_SSEROUND)
+#define OFFB_XMM0 offsetof(VexGuestAMD64State,guest_XMM0)
+#define OFFB_XMM1 offsetof(VexGuestAMD64State,guest_XMM1)
+#define OFFB_XMM2 offsetof(VexGuestAMD64State,guest_XMM2)
+#define OFFB_XMM3 offsetof(VexGuestAMD64State,guest_XMM3)
+#define OFFB_XMM4 offsetof(VexGuestAMD64State,guest_XMM4)
+#define OFFB_XMM5 offsetof(VexGuestAMD64State,guest_XMM5)
+#define OFFB_XMM6 offsetof(VexGuestAMD64State,guest_XMM6)
+#define OFFB_XMM7 offsetof(VexGuestAMD64State,guest_XMM7)
+#define OFFB_XMM8 offsetof(VexGuestAMD64State,guest_XMM8)
+#define OFFB_XMM9 offsetof(VexGuestAMD64State,guest_XMM9)
+#define OFFB_XMM10 offsetof(VexGuestAMD64State,guest_XMM10)
+#define OFFB_XMM11 offsetof(VexGuestAMD64State,guest_XMM11)
+#define OFFB_XMM12 offsetof(VexGuestAMD64State,guest_XMM12)
+#define OFFB_XMM13 offsetof(VexGuestAMD64State,guest_XMM13)
+#define OFFB_XMM14 offsetof(VexGuestAMD64State,guest_XMM14)
+#define OFFB_XMM15 offsetof(VexGuestAMD64State,guest_XMM15)
+#define OFFB_XMM16 offsetof(VexGuestAMD64State,guest_XMM16)
+
+#define OFFB_EMWARN offsetof(VexGuestAMD64State,guest_EMWARN)
+#define OFFB_TISTART offsetof(VexGuestAMD64State,guest_TISTART)
+#define OFFB_TILEN offsetof(VexGuestAMD64State,guest_TILEN)
+
+#define OFFB_NRADDR offsetof(VexGuestAMD64State,guest_NRADDR)
+
+
+/*------------------------------------------------------------*/
+/*--- Helper bits and pieces for deconstructing the ---*/
+/*--- amd64 insn stream. ---*/
+/*------------------------------------------------------------*/
+
+/* This is the AMD64 register encoding -- integer regs. */
+#define R_RAX 0
+#define R_RCX 1
+#define R_RDX 2
+#define R_RBX 3
+#define R_RSP 4
+#define R_RBP 5
+#define R_RSI 6
+#define R_RDI 7
+#define R_R8 8
+#define R_R9 9
+#define R_R10 10
+#define R_R11 11
+#define R_R12 12
+#define R_R13 13
+#define R_R14 14
+#define R_R15 15
+
+//.. #define R_AL (0+R_EAX)
+//.. #define R_AH (4+R_EAX)
+
+/* This is the Intel register encoding -- segment regs. */
+#define R_ES 0
+#define R_CS 1
+#define R_SS 2
+#define R_DS 3
+#define R_FS 4
+#define R_GS 5
+
+
+/* Various simple conversions */
+
+static ULong extend_s_8to64 ( UChar x )
+{
+ return (ULong)((((Long)x) << 56) >> 56);
+}
+
+static ULong extend_s_16to64 ( UShort x )
+{
+ return (ULong)((((Long)x) << 48) >> 48);
+}
+
+static ULong extend_s_32to64 ( UInt x )
+{
+ return (ULong)((((Long)x) << 32) >> 32);
+}
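+
+/* For example, extend_s_8to64(0x80) == 0xFFFFFFFFFFFFFF80ULL, while
+   extend_s_8to64(0x7F) == 0x7FULL. */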
+
+/* Figure out whether the mod and rm parts of a modRM byte refer to a
+ register or memory. If so, the byte will have the form 11XXXYYY,
+ where YYY is the register number. */
+inline
+static Bool epartIsReg ( UChar mod_reg_rm )
+{
+ return toBool(0xC0 == (mod_reg_rm & 0xC0));
+}
+
+/* Extract the 'g' field from a modRM byte. This only produces 3
+ bits, which is not a complete register number. You should avoid
+ this function if at all possible. */
+inline
+static Int gregLO3ofRM ( UChar mod_reg_rm )
+{
+ return (Int)( (mod_reg_rm >> 3) & 7 );
+}
+
+/* Ditto the 'e' field of a modRM byte. */
+inline
+static Int eregLO3ofRM ( UChar mod_reg_rm )
+{
+ return (Int)(mod_reg_rm & 0x7);
+}
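+
+/* Worked example (illustrative): for the modRM byte 0xD8, which is
+   11 011 000 in binary, epartIsReg returns True, gregLO3ofRM
+   returns 3 and eregLO3ofRM returns 0. */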
+
+/* Get an 8/16-bit unsigned value out of the insn stream. */
+
+static UChar getUChar ( Long delta )
+{
+ UChar v = guest_code[delta+0];
+ return v;
+}
+
+static UInt getUDisp16 ( Long delta )
+{
+ UInt v = guest_code[delta+1]; v <<= 8;
+ v |= guest_code[delta+0];
+ return v & 0xFFFF;
+}
+
+//.. static UInt getUDisp ( Int size, Long delta )
+//.. {
+//.. switch (size) {
+//.. case 4: return getUDisp32(delta);
+//.. case 2: return getUDisp16(delta);
+//.. case 1: return getUChar(delta);
+//.. default: vpanic("getUDisp(x86)");
+//.. }
+//.. return 0; /*notreached*/
+//.. }
+
+
+/* Get a byte value out of the insn stream and sign-extend to 64
+ bits. */
+static Long getSDisp8 ( Long delta )
+{
+ return extend_s_8to64( guest_code[delta] );
+}
+
+/* Get a 16-bit value out of the insn stream and sign-extend to 64
+ bits. */
+static Long getSDisp16 ( Long delta )
+{
+ UInt v = guest_code[delta+1]; v <<= 8;
+ v |= guest_code[delta+0];
+ return extend_s_16to64( (UShort)v );
+}
+
+/* Get a 32-bit value out of the insn stream and sign-extend to 64
+ bits. */
+static Long getSDisp32 ( Long delta )
+{
+ UInt v = guest_code[delta+3]; v <<= 8;
+ v |= guest_code[delta+2]; v <<= 8;
+ v |= guest_code[delta+1]; v <<= 8;
+ v |= guest_code[delta+0];
+ return extend_s_32to64( v );
+}
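+
+/* For example, if the four bytes at 'delta' are FE FF FF FF, the
+   little-endian 32-bit value assembled above is 0xFFFFFFFE, and
+   getSDisp32 returns -2. */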
+
+/* Get a 64-bit value out of the insn stream. */
+static Long getDisp64 ( Long delta )
+{
+ ULong v = 0;
+ v |= guest_code[delta+7]; v <<= 8;
+ v |= guest_code[delta+6]; v <<= 8;
+ v |= guest_code[delta+5]; v <<= 8;
+ v |= guest_code[delta+4]; v <<= 8;
+ v |= guest_code[delta+3]; v <<= 8;
+ v |= guest_code[delta+2]; v <<= 8;
+ v |= guest_code[delta+1]; v <<= 8;
+ v |= guest_code[delta+0];
+ return v;
+}
+
+/* Note: because AMD64 doesn't allow 64-bit literals except in the
+   MOV-immediate case (which is fetched via getDisp64 instead), it is
+   an error if this is called with size==8.  Should not happen. */
+static Long getSDisp ( Int size, Long delta )
+{
+ switch (size) {
+ case 4: return getSDisp32(delta);
+ case 2: return getSDisp16(delta);
+ case 1: return getSDisp8(delta);
+ default: vpanic("getSDisp(amd64)");
+ }
+}
+
+static ULong mkSizeMask ( Int sz )
+{
+ switch (sz) {
+ case 1: return 0x00000000000000FFULL;
+ case 2: return 0x000000000000FFFFULL;
+ case 4: return 0x00000000FFFFFFFFULL;
+ case 8: return 0xFFFFFFFFFFFFFFFFULL;
+ default: vpanic("mkSzMask(amd64)");
+ }
+}
+
+static Int imin ( Int a, Int b )
+{
+ return (a < b) ? a : b;
+}
+
+static IRType szToITy ( Int n )
+{
+ switch (n) {
+ case 1: return Ity_I8;
+ case 2: return Ity_I16;
+ case 4: return Ity_I32;
+ case 8: return Ity_I64;
+ default: vex_printf("\nszToITy(%d)\n", n);
+ vpanic("szToITy(amd64)");
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- For dealing with prefixes. ---*/
+/*------------------------------------------------------------*/
+
+/* The idea is to pass around an int holding a bitmask summarising
+ info from the prefixes seen on the current instruction, including
+ info from the REX byte. This info is used in various places, but
+ most especially when making sense of register fields in
+ instructions.
+
+ The top 16 bits of the prefix are 0x3141, just as a hacky way
+ to ensure it really is a valid prefix.
+
+ Things you can safely assume about a well-formed prefix:
+ * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
+ * if REX is not present then REXW,REXR,REXX,REXB will read
+ as zero.
+ * F2 and F3 will not both be 1.
+*/
+
+typedef UInt Prefix;
+
+#define PFX_ASO (1<<0) /* address-size override present (0x67) */
+#define PFX_66 (1<<1) /* operand-size override-to-16 present (0x66) */
+#define PFX_REX (1<<2) /* REX byte present (0x40 to 0x4F) */
+#define PFX_REXW (1<<3) /* REX W bit, if REX present, else 0 */
+#define PFX_REXR (1<<4) /* REX R bit, if REX present, else 0 */
+#define PFX_REXX (1<<5) /* REX X bit, if REX present, else 0 */
+#define PFX_REXB (1<<6) /* REX B bit, if REX present, else 0 */
+#define PFX_LOCK (1<<7) /* bus LOCK prefix present (0xF0) */
+#define PFX_F2 (1<<8) /* REPNE/REPNZ prefix present (0xF2) */
+#define PFX_F3 (1<<9) /* REP/REPE/REPZ prefix present (0xF3) */
+#define PFX_CS (1<<10) /* CS segment prefix present (0x2E) */
+#define PFX_DS (1<<11) /* DS segment prefix present (0x3E) */
+#define PFX_ES (1<<12) /* ES segment prefix present (0x26) */
+#define PFX_FS (1<<13) /* FS segment prefix present (0x64) */
+#define PFX_GS (1<<14) /* GS segment prefix present (0x65) */
+#define PFX_SS (1<<15) /* SS segment prefix present (0x36) */
+
+#define PFX_EMPTY 0x31410000
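+
+/* Illustrative example: an instruction carrying the prefix bytes
+   66 48 (an operand-size override, then REX with W set) would be
+   summarised as PFX_EMPTY | PFX_66 | PFX_REX | PFX_REXW. */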
+
+static Bool IS_VALID_PFX ( Prefix pfx ) {
+ return toBool((pfx & 0xFFFF0000) == PFX_EMPTY);
+}
+
+static Bool haveREX ( Prefix pfx ) {
+ return toBool(pfx & PFX_REX);
+}
+
+static Int getRexW ( Prefix pfx ) {
+ return (pfx & PFX_REXW) ? 1 : 0;
+}
+/* Apparently unused.
+static Int getRexR ( Prefix pfx ) {
+ return (pfx & PFX_REXR) ? 1 : 0;
+}
+*/
+static Int getRexX ( Prefix pfx ) {
+ return (pfx & PFX_REXX) ? 1 : 0;
+}
+static Int getRexB ( Prefix pfx ) {
+ return (pfx & PFX_REXB) ? 1 : 0;
+}
+
+/* Check a prefix doesn't have F2 or F3 set in it, since usually that
+ completely changes what instruction it really is. */
+static Bool haveF2orF3 ( Prefix pfx ) {
+ return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
+}
+static Bool haveF2 ( Prefix pfx ) {
+ return toBool((pfx & PFX_F2) > 0);
+}
+static Bool haveF3 ( Prefix pfx ) {
+ return toBool((pfx & PFX_F3) > 0);
+}
+
+static Bool have66 ( Prefix pfx ) {
+ return toBool((pfx & PFX_66) > 0);
+}
+static Bool haveASO ( Prefix pfx ) {
+ return toBool((pfx & PFX_ASO) > 0);
+}
+
+/* Return True iff pfx has 66 set and F2 and F3 clear */
+static Bool have66noF2noF3 ( Prefix pfx )
+{
+ return
+ toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
+}
+
+/* Return True iff pfx has F2 set and 66 and F3 clear */
+static Bool haveF2no66noF3 ( Prefix pfx )
+{
+ return
+ toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
+}
+
+/* Return True iff pfx has F3 set and 66 and F2 clear */
+static Bool haveF3no66noF2 ( Prefix pfx )
+{
+ return
+ toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
+}
+
+/* Return True iff pfx has F3 set and F2 clear */
+static Bool haveF3noF2 ( Prefix pfx )
+{
+ return
+ toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
+}
+
+/* Return True iff pfx has 66, F2 and F3 clear */
+static Bool haveNo66noF2noF3 ( Prefix pfx )
+{
+ return
+ toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
+}
+
+/* Return True iff pfx has any of 66, F2 and F3 set */
+static Bool have66orF2orF3 ( Prefix pfx )
+{
+ return toBool( ! haveNo66noF2noF3(pfx) );
+}
+
+/* Return True iff pfx has 66 or F2 set */
+static Bool have66orF2 ( Prefix pfx )
+{
+ return toBool((pfx & (PFX_66|PFX_F2)) > 0);
+}
+
+/* Clear all the segment-override bits in a prefix. */
+static Prefix clearSegBits ( Prefix p )
+{
+ return
+ p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
+}
+
+
+/*------------------------------------------------------------*/
+/*--- For dealing with integer registers ---*/
+/*------------------------------------------------------------*/
+
+/* This is somewhat complex. The rules are:
+
+ For 64, 32 and 16 bit register references, the e or g fields in the
+ modrm bytes supply the low 3 bits of the register number. The
+ fourth (most-significant) bit of the register number is supplied by
+ the REX byte, if it is present; else that bit is taken to be zero.
+
+ The REX.R bit supplies the high bit corresponding to the g register
+ field, and the REX.B bit supplies the high bit corresponding to the
+ e register field (when the mod part of modrm indicates that modrm's
+ e component refers to a register and not to memory).
+
+   The REX.X bit supplies the high bit of the index register in SIB
+   address modes, and is rarely used.
+
+ For 8 bit register references, the presence of the REX byte itself
+ has significance. If there is no REX present, then the 3-bit
+ number extracted from the modrm e or g field is treated as an index
+ into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
+ old x86 encoding scheme.
+
+   But if there is a REX present, the register reference is
+   interpreted in the same way as for 64/32/16-bit references: a high
+   bit is extracted from REX, giving a 4-bit number, and the denoted
+   register is the lowest 8 bits of the integer register selected by
+   that number.  In particular, values 4 through 7 of this sequence
+   do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits
+   of %rsp %rbp %rsi %rdi.
+
+ The REX.W bit has no bearing at all on register numbers. Instead
+ its presence indicates that the operand size is to be overridden
+ from its default value (32 bits) to 64 bits instead. This is in
+ the same fashion that an 0x66 prefix indicates the operand size is
+ to be overridden from 32 bits down to 16 bits. When both REX.W and
+ 0x66 are present there is a conflict, and REX.W takes precedence.
+
+ Rather than try to handle this complexity using a single huge
+ function, several smaller ones are provided. The aim is to make it
+ as difficult as possible to screw up register decoding in a subtle
+ and hard-to-track-down way.
+
+ Because these routines fish around in the host's memory (that is,
+ in the guest state area) for sub-parts of guest registers, their
+ correctness depends on the host's endianness. So far these
+ routines only work for little-endian hosts. Those for which
+ endianness is important have assertions to ensure sanity.
+*/
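+
+/* Two illustrative cases of the 8-bit rules above: with no REX
+   present, an e or g field of 4 denotes %ah (the old x86 scheme);
+   with a REX present and the relevant extension bit clear, the same
+   field denotes %spl, the lowest 8 bits of %rsp. */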
+
+
+/* About the simplest question you can ask: where do the 64-bit
+ integer registers live (in the guest state) ? */
+
+static Int integerGuestReg64Offset ( UInt reg )
+{
+ switch (reg) {
+ case R_RAX: return OFFB_RAX;
+ case R_RCX: return OFFB_RCX;
+ case R_RDX: return OFFB_RDX;
+ case R_RBX: return OFFB_RBX;
+ case R_RSP: return OFFB_RSP;
+ case R_RBP: return OFFB_RBP;
+ case R_RSI: return OFFB_RSI;
+ case R_RDI: return OFFB_RDI;
+ case R_R8: return OFFB_R8;
+ case R_R9: return OFFB_R9;
+ case R_R10: return OFFB_R10;
+ case R_R11: return OFFB_R11;
+ case R_R12: return OFFB_R12;
+ case R_R13: return OFFB_R13;
+ case R_R14: return OFFB_R14;
+ case R_R15: return OFFB_R15;
+ default: vpanic("integerGuestReg64Offset(amd64)");
+ }
+}
+
+
+/* Produce the name of an integer register, for printing purposes.
+ reg is a number in the range 0 .. 15 that has been generated from a
+ 3-bit reg-field number and a REX extension bit. irregular denotes
+ the case where sz==1 and no REX byte is present. */
+
+static
+HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
+{
+ static HChar* ireg64_names[16]
+ = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
+ static HChar* ireg32_names[16]
+ = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
+ "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
+ static HChar* ireg16_names[16]
+ = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di",
+ "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
+ static HChar* ireg8_names[16]
+ = { "%al", "%cl", "%dl", "%bl", "%spl", "%bpl", "%sil", "%dil",
+ "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
+ static HChar* ireg8_irregular[8]
+ = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };
+
+ vassert(reg < 16);
+ if (sz == 1) {
+ if (irregular)
+ vassert(reg < 8);
+ } else {
+ vassert(irregular == False);
+ }
+
+ switch (sz) {
+ case 8: return ireg64_names[reg];
+ case 4: return ireg32_names[reg];
+ case 2: return ireg16_names[reg];
+ case 1: if (irregular) {
+ return ireg8_irregular[reg];
+ } else {
+ return ireg8_names[reg];
+ }
+ default: vpanic("nameIReg(amd64)");
+ }
+}
+
+/* Using the same argument conventions as nameIReg, produce the
+ guest state offset of an integer register. */
+
+static
+Int offsetIReg ( Int sz, UInt reg, Bool irregular )
+{
+ vassert(reg < 16);
+ if (sz == 1) {
+ if (irregular)
+ vassert(reg < 8);
+ } else {
+ vassert(irregular == False);
+ }
+
+ /* Deal with irregular case -- sz==1 and no REX present */
+ if (sz == 1 && irregular) {
+ switch (reg) {
+ case R_RSP: return 1+ OFFB_RAX;
+ case R_RBP: return 1+ OFFB_RCX;
+ case R_RSI: return 1+ OFFB_RDX;
+ case R_RDI: return 1+ OFFB_RBX;
+ default: break; /* use the normal case */
+ }
+ }
+
+ /* Normal case */
+ return integerGuestReg64Offset(reg);
+}
+
+
+/* Read the %CL register :: Ity_I8, for shift/rotate operations. */
+
+static IRExpr* getIRegCL ( void )
+{
+ vassert(!host_is_bigendian);
+ return IRExpr_Get( OFFB_RCX, Ity_I8 );
+}
+
+
+/* Write to the %AH register. */
+
+static void putIRegAH ( IRExpr* e )
+{
+ vassert(!host_is_bigendian);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
+ stmt( IRStmt_Put( OFFB_RAX+1, e ) );
+}
+
+
+/* Read/write various widths of %RAX, as it has various
+ special-purpose uses. */
+
+static HChar* nameIRegRAX ( Int sz )
+{
+ switch (sz) {
+ case 1: return "%al";
+ case 2: return "%ax";
+ case 4: return "%eax";
+ case 8: return "%rax";
+ default: vpanic("nameIRegRAX(amd64)");
+ }
+}
+
+static IRExpr* getIRegRAX ( Int sz )
+{
+ vassert(!host_is_bigendian);
+ switch (sz) {
+ case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
+ case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
+ case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
+ case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
+ default: vpanic("getIRegRAX(amd64)");
+ }
+}
+
+static void putIRegRAX ( Int sz, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(irsb->tyenv, e);
+ vassert(!host_is_bigendian);
+ switch (sz) {
+ case 8: vassert(ty == Ity_I64);
+ stmt( IRStmt_Put( OFFB_RAX, e ));
+ break;
+ case 4: vassert(ty == Ity_I32);
+ stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
+ break;
+ case 2: vassert(ty == Ity_I16);
+ stmt( IRStmt_Put( OFFB_RAX, e ));
+ break;
+ case 1: vassert(ty == Ity_I8);
+ stmt( IRStmt_Put( OFFB_RAX, e ));
+ break;
+ default: vpanic("putIRegRAX(amd64)");
+ }
+}
+
+
+/* Read/write various widths of %RDX, as it has various
+ special-purpose uses. */
+
+static HChar* nameIRegRDX ( Int sz )
+{
+ switch (sz) {
+ case 1: return "%dl";
+ case 2: return "%dx";
+ case 4: return "%edx";
+ case 8: return "%rdx";
+ default: vpanic("nameIRegRDX(amd64)");
+ }
+}
+
+static IRExpr* getIRegRDX ( Int sz )
+{
+ vassert(!host_is_bigendian);
+ switch (sz) {
+ case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
+ case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
+ case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
+ case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
+ default: vpanic("getIRegRDX(amd64)");
+ }
+}
+
+static void putIRegRDX ( Int sz, IRExpr* e )
+{
+ vassert(!host_is_bigendian);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
+ switch (sz) {
+ case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
+ break;
+ case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
+ break;
+ case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
+ break;
+ case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
+ break;
+ default: vpanic("putIRegRDX(amd64)");
+ }
+}
+
+
+/* Simplistic functions to deal with the integer registers as a
+ straightforward bank of 16 64-bit regs. */
+
+static IRExpr* getIReg64 ( UInt regno )
+{
+ return IRExpr_Get( integerGuestReg64Offset(regno),
+ Ity_I64 );
+}
+
+static void putIReg64 ( UInt regno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
+ stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
+}
+
+static HChar* nameIReg64 ( UInt regno )
+{
+ return nameIReg( 8, regno, False );
+}
+
+
+/* Simplistic functions to deal with the lower halves of integer
+ registers as a straightforward bank of 16 32-bit regs. */
+
+static IRExpr* getIReg32 ( UInt regno )
+{
+ vassert(!host_is_bigendian);
+ return unop(Iop_64to32,
+ IRExpr_Get( integerGuestReg64Offset(regno),
+ Ity_I64 ));
+}
+
+static void putIReg32 ( UInt regno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
+ stmt( IRStmt_Put( integerGuestReg64Offset(regno),
+ unop(Iop_32Uto64,e) ) );
+}
+
+static HChar* nameIReg32 ( UInt regno )
+{
+ return nameIReg( 4, regno, False );
+}
+
+
+/* Simplistic functions to deal with the lower quarters of integer
+ registers as a straightforward bank of 16 16-bit regs. */
+
+static IRExpr* getIReg16 ( UInt regno )
+{
+ vassert(!host_is_bigendian);
+ return IRExpr_Get( integerGuestReg64Offset(regno),
+ Ity_I16 );
+}
+
+static HChar* nameIReg16 ( UInt regno )
+{
+ return nameIReg( 2, regno, False );
+}
+
+
+/* Sometimes what we know is a 3-bit register number, a REX byte, and
+ which field of the REX byte is to be used to extend to a 4-bit
+ number. These functions cater for that situation.
+*/
+static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
+{
+ vassert(lo3bits < 8);
+ vassert(IS_VALID_PFX(pfx));
+ return getIReg64( lo3bits | (getRexX(pfx) << 3) );
+}
+
+static HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
+{
+ vassert(lo3bits < 8);
+ vassert(IS_VALID_PFX(pfx));
+ return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
+}
+
+static HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
+{
+ vassert(lo3bits < 8);
+ vassert(IS_VALID_PFX(pfx));
+ vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
+ return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
+ toBool(sz==1 && !haveREX(pfx)) );
+}
+
+static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
+{
+ vassert(lo3bits < 8);
+ vassert(IS_VALID_PFX(pfx));
+ vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
+ if (sz == 4) {
+ sz = 8;
+ return unop(Iop_64to32,
+ IRExpr_Get(
+ offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
+ toBool(sz==1 && !haveREX(pfx)) ),
+ szToITy(sz)
+ )
+ );
+ } else {
+ return IRExpr_Get(
+ offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
+ toBool(sz==1 && !haveREX(pfx)) ),
+ szToITy(sz)
+ );
+ }
+}
+
+static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
+{
+ vassert(lo3bits < 8);
+ vassert(IS_VALID_PFX(pfx));
+ vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
+ stmt( IRStmt_Put(
+ offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
+ toBool(sz==1 && !haveREX(pfx)) ),
+ sz==4 ? unop(Iop_32Uto64,e) : e
+ ));
+}
+
+
+/* Functions for getting register numbers from modrm bytes and REX
+ when we don't have to consider the complexities of integer subreg
+ accesses.
+*/
+/* Extract the g reg field from a modRM byte, and augment it using the
+ REX.R bit from the supplied REX byte. The R bit usually is
+ associated with the g register field.
+*/
+static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
+{
+ Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
+ reg += (pfx & PFX_REXR) ? 8 : 0;
+ return reg;
+}
+
+/* Extract the e reg field from a modRM byte, and augment it using the
+ REX.B bit from the supplied REX byte. The B bit usually is
+ associated with the e register field (when modrm indicates e is a
+ register, that is).
+*/
+static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
+{
+ Int rm;
+ vassert(epartIsReg(mod_reg_rm));
+ rm = (Int)(mod_reg_rm & 0x7);
+ rm += (pfx & PFX_REXB) ? 8 : 0;
+ return rm;
+}
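+
+/* For example, given a prefix with REX.B set, the modRM byte 0xC1
+   (mod=11, rm=001) yields 1 + 8 == 9 from eregOfRexRM, denoting
+   %r9. */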
+
+
+/* General functions for dealing with integer register access. */
+
+/* Produce the guest state offset for a reference to the 'g' register
+ field in a modrm byte, taking into account REX (or its absence),
+ and the size of the access.
+*/
+static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
+{
+ UInt reg;
+ vassert(!host_is_bigendian);
+ vassert(IS_VALID_PFX(pfx));
+ vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
+ reg = gregOfRexRM( pfx, mod_reg_rm );
+ return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
+}
+
+static
+IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
+{
+ if (sz == 4) {
+ sz = 8;
+ return unop(Iop_64to32,
+ IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
+ szToITy(sz) ));
+ } else {
+ return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
+ szToITy(sz) );
+ }
+}
+
+static
+void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
+ if (sz == 4) {
+ e = unop(Iop_32Uto64,e);
+ }
+ stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
+}
+
+static
+HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
+{
+ return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
+ toBool(sz==1 && !haveREX(pfx)) );
+}
+
+
+/* Produce the guest state offset for a reference to the 'e' register
+ field in a modrm byte, taking into account REX (or its absence),
+ and the size of the access. eregOfRexRM will assert if mod_reg_rm
+ denotes a memory access rather than a register access.
+*/
+static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
+{
+ UInt reg;
+ vassert(!host_is_bigendian);
+ vassert(IS_VALID_PFX(pfx));
+ vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
+ reg = eregOfRexRM( pfx, mod_reg_rm );
+ return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
+}
+
+static
+IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
+{
+ if (sz == 4) {
+ sz = 8;
+ return unop(Iop_64to32,
+ IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
+ szToITy(sz) ));
+ } else {
+ return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
+ szToITy(sz) );
+ }
+}
+
+static
+void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
+ if (sz == 4) {
+ e = unop(Iop_32Uto64,e);
+ }
+ stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
+}
+
+static
+HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
+{
+ return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
+ toBool(sz==1 && !haveREX(pfx)) );
+}
+
+
+/*------------------------------------------------------------*/
+/*--- For dealing with XMM registers ---*/
+/*------------------------------------------------------------*/
+
+//.. static Int segmentGuestRegOffset ( UInt sreg )
+//.. {
+//.. switch (sreg) {
+//.. case R_ES: return OFFB_ES;
+//.. case R_CS: return OFFB_CS;
+//.. case R_SS: return OFFB_SS;
+//.. case R_DS: return OFFB_DS;
+//.. case R_FS: return OFFB_FS;
+//.. case R_GS: return OFFB_GS;
+//.. default: vpanic("segmentGuestRegOffset(x86)");
+//.. }
+//.. }
+
+static Int xmmGuestRegOffset ( UInt xmmreg )
+{
+ switch (xmmreg) {
+ case 0: return OFFB_XMM0;
+ case 1: return OFFB_XMM1;
+ case 2: return OFFB_XMM2;
+ case 3: return OFFB_XMM3;
+ case 4: return OFFB_XMM4;
+ case 5: return OFFB_XMM5;
+ case 6: return OFFB_XMM6;
+ case 7: return OFFB_XMM7;
+ case 8: return OFFB_XMM8;
+ case 9: return OFFB_XMM9;
+ case 10: return OFFB_XMM10;
+ case 11: return OFFB_XMM11;
+ case 12: return OFFB_XMM12;
+ case 13: return OFFB_XMM13;
+ case 14: return OFFB_XMM14;
+ case 15: return OFFB_XMM15;
+ default: vpanic("xmmGuestRegOffset(amd64)");
+ }
+}
+
+/* Lanes of vector registers are always numbered from zero being the
+ least significant lane (rightmost in the register). */
+
+static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
+{
+ /* Correct for little-endian host only. */
+ vassert(!host_is_bigendian);
+ vassert(laneno >= 0 && laneno < 8);
+ return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
+}
+
+static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
+{
+ /* Correct for little-endian host only. */
+ vassert(!host_is_bigendian);
+ vassert(laneno >= 0 && laneno < 4);
+ return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
+}
+
+static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
+{
+ /* Correct for little-endian host only. */
+ vassert(!host_is_bigendian);
+ vassert(laneno >= 0 && laneno < 2);
+ return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
+}
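+
+/* For example, xmmGuestRegLane32offset(3, 2) is OFFB_XMM3 + 8, the
+   offset of lane 2 (bits 95:64) of %xmm3 under the little-endian
+   lane numbering described above. */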
+
+//.. static IRExpr* getSReg ( UInt sreg )
+//.. {
+//.. return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
+//.. }
+//..
+//.. static void putSReg ( UInt sreg, IRExpr* e )
+//.. {
+//.. vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
+//.. stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
+//.. }
+
+static IRExpr* getXMMReg ( UInt xmmreg )
+{
+ return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
+}
+
+static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
+{
+ return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
+}
+
+static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
+{
+ return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
+}
+
+static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
+{
+ return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
+}
+
+static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
+{
+ return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
+}
+
+static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
+{
+ return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
+}
+
+static void putXMMReg ( UInt xmmreg, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
+ stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
+}
+
+static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
+ stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
+}
+
+static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
+ stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
+}
+
+static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
+ stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
+}
+
+static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
+ stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
+}
+
+static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
+ stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
+}
+
+static IRExpr* mkV128 ( UShort mask )
+{
+ return IRExpr_Const(IRConst_V128(mask));
+}
+
+static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
+ vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
+ return unop(Iop_64to1,
+ binop(Iop_And64,
+ unop(Iop_1Uto64,x),
+ unop(Iop_1Uto64,y)));
+}
+
+/* Generate a compare-and-swap operation, operating on memory at
+ 'addr'. The expected value is 'expVal' and the new value is
+ 'newVal'. If the operation fails, then transfer control (with a
+ no-redir jump (XXX no -- see comment at top of this file)) to
+ 'restart_point', which is presumably the address of the guest
+ instruction again -- retrying, essentially. */
+static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
+ Addr64 restart_point )
+{
+ IRCAS* cas;
+ IRType tyE = typeOfIRExpr(irsb->tyenv, expVal);
+ IRType tyN = typeOfIRExpr(irsb->tyenv, newVal);
+ IRTemp oldTmp = newTemp(tyE);
+ IRTemp expTmp = newTemp(tyE);
+ vassert(tyE == tyN);
+ vassert(tyE == Ity_I64 || tyE == Ity_I32
+ || tyE == Ity_I16 || tyE == Ity_I8);
+ assign(expTmp, expVal);
+ cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
+ NULL, mkexpr(expTmp), NULL, newVal );
+ stmt( IRStmt_CAS(cas) );
+ stmt( IRStmt_Exit(
+ binop( mkSizedOp(tyE,Iop_CasCmpNE8),
+ mkexpr(oldTmp), mkexpr(expTmp) ),
+ Ijk_Boring, /*Ijk_NoRedir*/
+ IRConst_U64( restart_point )
+ ));
+}
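+
+/* Sketch of intended use (the temporaries 'taddr', 'told' and 'tnew'
+   are hypothetical, as is the use of guest_RIP_curr_instr as the
+   restart address): a LOCK-prefixed read-modify-write would compute
+   the updated value into 'tnew' and then do
+
+      casLE( mkexpr(taddr), mkexpr(told), mkexpr(tnew),
+             guest_RIP_curr_instr );
+
+   so that, if another thread changed the location in between, the
+   whole guest instruction is restarted rather than completed with a
+   stale value. */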
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for %rflags. ---*/
+/*------------------------------------------------------------*/
+
+/* -------------- Evaluating the flags-thunk. -------------- */
+
+/* Build IR to calculate all the eflags from stored
+ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
+ Ity_I64. */
+static IRExpr* mk_amd64g_calculate_rflags_all ( void )
+{
+ IRExpr** args
+ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I64,
+ 0/*regparm*/,
+ "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
+ args
+ );
+ /* Exclude OP and NDEP from definedness checking. We're only
+ interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
+ return call;
+}
+
+/* Build IR to calculate some particular condition from stored
+   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
+   Ity_I1. */
+static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
+{
+ IRExpr** args
+ = mkIRExprVec_5( mkU64(cond),
+ IRExpr_Get(OFFB_CC_OP, Ity_I64),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I64,
+ 0/*regparm*/,
+ "amd64g_calculate_condition", &amd64g_calculate_condition,
+ args
+ );
+ /* Exclude the requested condition, OP and NDEP from definedness
+ checking. We're only interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
+ return unop(Iop_64to1, call);
+}
+
+/* Build IR to calculate just the carry flag from stored
+ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression :: Ity_I64. */
+static IRExpr* mk_amd64g_calculate_rflags_c ( void )
+{
+ IRExpr** args
+ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I64),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I64,
+ 0/*regparm*/,
+ "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
+ args
+ );
+ /* Exclude OP and NDEP from definedness checking. We're only
+ interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
+ return call;
+}
+
+
+/* -------------- Building the flags-thunk. -------------- */
+
+/* The machinery in this section builds the flag-thunk following a
+ flag-setting operation. Hence the various setFlags_* functions.
+*/
+
+static Bool isAddSub ( IROp op8 )
+{
+ return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
+}
+
+static Bool isLogic ( IROp op8 )
+{
+ return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
+}
+
+/* U-widen 8/16/32/64 bit int expr to 64. */
+static IRExpr* widenUto64 ( IRExpr* e )
+{
+ switch (typeOfIRExpr(irsb->tyenv,e)) {
+ case Ity_I64: return e;
+ case Ity_I32: return unop(Iop_32Uto64, e);
+ case Ity_I16: return unop(Iop_16Uto64, e);
+ case Ity_I8: return unop(Iop_8Uto64, e);
+ default: vpanic("widenUto64");
+ }
+}
+
+/* S-widen 8/16/32/64 bit int expr to 64. */
+static IRExpr* widenSto64 ( IRExpr* e )
+{
+ switch (typeOfIRExpr(irsb->tyenv,e)) {
+ case Ity_I64: return e;
+ case Ity_I32: return unop(Iop_32Sto64, e);
+ case Ity_I16: return unop(Iop_16Sto64, e);
+ case Ity_I8: return unop(Iop_8Sto64, e);
+ default: vpanic("widenSto64");
+ }
+}
+
+/* Narrow 8/16/32/64 bit int expr to 8/16/32/64. Clearly only some
+ of these combinations make sense. */
+static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
+{
+ IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
+ if (src_ty == dst_ty)
+ return e;
+ if (src_ty == Ity_I32 && dst_ty == Ity_I16)
+ return unop(Iop_32to16, e);
+ if (src_ty == Ity_I32 && dst_ty == Ity_I8)
+ return unop(Iop_32to8, e);
+ if (src_ty == Ity_I64 && dst_ty == Ity_I32)
+ return unop(Iop_64to32, e);
+ if (src_ty == Ity_I64 && dst_ty == Ity_I16)
+ return unop(Iop_64to16, e);
+ if (src_ty == Ity_I64 && dst_ty == Ity_I8)
+ return unop(Iop_64to8, e);
+
+ vex_printf("\nsrc, dst tys are: ");
+ ppIRType(src_ty);
+ vex_printf(", ");
+ ppIRType(dst_ty);
+ vex_printf("\n");
+ vpanic("narrowTo(amd64)");
+}
+
+
+/* Set the flags thunk OP, DEP1 and DEP2 fields. The supplied op is
+ auto-sized up to the real op. */
+
+static
+void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
+{
+ Int ccOp = 0;
+ switch (ty) {
+ case Ity_I8: ccOp = 0; break;
+ case Ity_I16: ccOp = 1; break;
+ case Ity_I32: ccOp = 2; break;
+ case Ity_I64: ccOp = 3; break;
+ default: vassert(0);
+ }
+ switch (op8) {
+ case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB; break;
+ case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB; break;
+ default: ppIROp(op8);
+ vpanic("setFlags_DEP1_DEP2(amd64)");
+ }
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(ccOp)) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
+}
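+
+/* For example, a 32-bit ADD arrives here with op8 == Iop_Add8 and
+   ty == Ity_I32, producing ccOp == AMD64G_CC_OP_ADDB + 2; the thunk
+   op codes for the B/W/L/Q variants are consecutive, so that is
+   AMD64G_CC_OP_ADDL. */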
+
+
+/* Set the OP and DEP1 fields only, and write zero to DEP2. */
+
+static
+void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
+{
+ Int ccOp = 0;
+ switch (ty) {
+ case Ity_I8: ccOp = 0; break;
+ case Ity_I16: ccOp = 1; break;
+ case Ity_I32: ccOp = 2; break;
+ case Ity_I64: ccOp = 3; break;
+ default: vassert(0);
+ }
+ switch (op8) {
+ case Iop_Or8:
+ case Iop_And8:
+ case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
+ default: ppIROp(op8);
+ vpanic("setFlags_DEP1(amd64)");
+ }
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(ccOp)) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
+}
+
+
+/* For shift operations, we put in the result and the undershifted
+   result.  If, however, the shift amount is zero, the thunk is left
+   unchanged. */
+
+static void setFlags_DEP1_DEP2_shift ( IROp op64,
+ IRTemp res,
+ IRTemp resUS,
+ IRType ty,
+ IRTemp guard )
+{
+ Int ccOp = 0;
+ switch (ty) {
+ case Ity_I8: ccOp = 0; break;
+ case Ity_I16: ccOp = 1; break;
+ case Ity_I32: ccOp = 2; break;
+ case Ity_I64: ccOp = 3; break;
+ default: vassert(0);
+ }
+
+ vassert(guard);
+
+ /* Both kinds of right shifts are handled by the same thunk
+ operation. */
+ switch (op64) {
+ case Iop_Shr64:
+ case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
+ case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
+ default: ppIROp(op64);
+ vpanic("setFlags_DEP1_DEP2_shift(amd64)");
+ }
+
+ /* DEP1 contains the result, DEP2 contains the undershifted value. */
+ stmt( IRStmt_Put( OFFB_CC_OP,
+ IRExpr_Mux0X( mkexpr(guard),
+ IRExpr_Get(OFFB_CC_OP,Ity_I64),
+ mkU64(ccOp))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ IRExpr_Mux0X( mkexpr(guard),
+ IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
+ widenUto64(mkexpr(res)))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2,
+ IRExpr_Mux0X( mkexpr(guard),
+ IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
+ widenUto64(mkexpr(resUS)))) );
+}
+
+
+/* For the inc/dec case, we store in DEP1 the result value and in NDEP
+ the former value of the carry flag, which unfortunately we have to
+ compute. */
+
+static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
+{
+ Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;
+
+ switch (ty) {
+ case Ity_I8: ccOp += 0; break;
+ case Ity_I16: ccOp += 1; break;
+ case Ity_I32: ccOp += 2; break;
+ case Ity_I64: ccOp += 3; break;
+ default: vassert(0);
+ }
+
+ /* This has to come first, because calculating the C flag
+ may require reading all four thunk fields. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(ccOp)) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
+}
+
+
+/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
+ two arguments. */
+
+static
+void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
+{
+ switch (ty) {
+ case Ity_I8:
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
+ break;
+ case Ity_I16:
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
+ break;
+ case Ity_I32:
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
+ break;
+ case Ity_I64:
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
+ break;
+ default:
+ vpanic("setFlags_MUL(amd64)");
+ }
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
+}
+
+
+/* -------------- Condition codes. -------------- */
+
+/* Condition codes, using the AMD encoding. */
+
+static HChar* name_AMD64Condcode ( AMD64Condcode cond )
+{
+ switch (cond) {
+ case AMD64CondO: return "o";
+ case AMD64CondNO: return "no";
+ case AMD64CondB: return "b";
+ case AMD64CondNB: return "ae"; /*"nb";*/
+ case AMD64CondZ: return "e"; /*"z";*/
+ case AMD64CondNZ: return "ne"; /*"nz";*/
+ case AMD64CondBE: return "be";
+ case AMD64CondNBE: return "a"; /*"nbe";*/
+ case AMD64CondS: return "s";
+ case AMD64CondNS: return "ns";
+ case AMD64CondP: return "p";
+ case AMD64CondNP: return "np";
+ case AMD64CondL: return "l";
+ case AMD64CondNL: return "ge"; /*"nl";*/
+ case AMD64CondLE: return "le";
+ case AMD64CondNLE: return "g"; /*"nle";*/
+ case AMD64CondAlways: return "ALWAYS";
+ default: vpanic("name_AMD64Condcode");
+ }
+}
+
+static
+AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode cond,
+ /*OUT*/Bool* needInvert )
+{
+ vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
+ if (cond & 1) {
+ *needInvert = True;
+ return cond-1;
+ } else {
+ *needInvert = False;
+ return cond;
+ }
+}
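+
+/* For example, AMD64CondNZ (odd-numbered, a negated condition) comes
+   back as AMD64CondZ with *needInvert set to True, whereas
+   AMD64CondZ is returned unchanged with *needInvert False. */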
+
+
+/* -------------- Helpers for ADD/SUB with carry. -------------- */
+
+/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
+ appropriately.
+
+ Optionally, generate a store for the 'tres' value. This can either
+ be a normal store, or it can be a cas-with-possible-failure style
+ store:
+
+ if taddr is IRTemp_INVALID, then no store is generated.
+
+ if taddr is not IRTemp_INVALID, then a store (using taddr as
+ the address) is generated:
+
+ if texpVal is IRTemp_INVALID then a normal store is
+ generated, and restart_point must be zero (it is irrelevant).
+
+ if texpVal is not IRTemp_INVALID then a cas-style store is
+ generated. texpVal is the expected value, restart_point
+ is the restart point if the store fails, and texpVal must
+ have the same type as tres.
+
+*/
+static void helper_ADC ( Int sz,
+ IRTemp tres, IRTemp ta1, IRTemp ta2,
+ /* info about optional store: */
+                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
+{
+ UInt thunkOp;
+ IRType ty = szToITy(sz);
+ IRTemp oldc = newTemp(Ity_I64);
+ IRTemp oldcn = newTemp(ty);
+ IROp plus = mkSizedOp(ty, Iop_Add8);
+ IROp xor = mkSizedOp(ty, Iop_Xor8);
+
+ vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
+
+ switch (sz) {
+ case 8: thunkOp = AMD64G_CC_OP_ADCQ; break;
+ case 4: thunkOp = AMD64G_CC_OP_ADCL; break;
+ case 2: thunkOp = AMD64G_CC_OP_ADCW; break;
+ case 1: thunkOp = AMD64G_CC_OP_ADCB; break;
+ default: vassert(0);
+ }
+
+ /* oldc = old carry flag, 0 or 1 */
+ assign( oldc, binop(Iop_And64,
+ mk_amd64g_calculate_rflags_c(),
+ mkU64(1)) );
+
+ assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
+
+ assign( tres, binop(plus,
+ binop(plus,mkexpr(ta1),mkexpr(ta2)),
+ mkexpr(oldcn)) );
+
+ /* Possibly generate a store of 'tres' to 'taddr'. See comment at
+ start of this function. */
+ if (taddr != IRTemp_INVALID) {
+ if (texpVal == IRTemp_INVALID) {
+ vassert(restart_point == 0);
+ storeLE( mkexpr(taddr), mkexpr(tres) );
+ } else {
+ vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
+ /* .. and hence 'texpVal' has the same type as 'tres'. */
+ casLE( mkexpr(taddr),
+ mkexpr(texpVal), mkexpr(tres), restart_point );
+ }
+ }
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(thunkOp) ) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1)) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
+ mkexpr(oldcn)) )) );
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
+}
+
+
+/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
+ appropriately. As with helper_ADC, possibly generate a store of
+ the result -- see comments on helper_ADC for details.
+*/
+static void helper_SBB ( Int sz,
+ IRTemp tres, IRTemp ta1, IRTemp ta2,
+ /* info about optional store: */
+                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
+{
+ UInt thunkOp;
+ IRType ty = szToITy(sz);
+ IRTemp oldc = newTemp(Ity_I64);
+ IRTemp oldcn = newTemp(ty);
+ IROp minus = mkSizedOp(ty, Iop_Sub8);
+ IROp xor = mkSizedOp(ty, Iop_Xor8);
+
+ vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
+
+ switch (sz) {
+ case 8: thunkOp = AMD64G_CC_OP_SBBQ; break;
+ case 4: thunkOp = AMD64G_CC_OP_SBBL; break;
+ case 2: thunkOp = AMD64G_CC_OP_SBBW; break;
+ case 1: thunkOp = AMD64G_CC_OP_SBBB; break;
+ default: vassert(0);
+ }
+
+ /* oldc = old carry flag, 0 or 1 */
+ assign( oldc, binop(Iop_And64,
+ mk_amd64g_calculate_rflags_c(),
+ mkU64(1)) );
+
+ assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
+
+ assign( tres, binop(minus,
+ binop(minus,mkexpr(ta1),mkexpr(ta2)),
+ mkexpr(oldcn)) );
+
+ /* Possibly generate a store of 'tres' to 'taddr'. See comment at
+ start of this function. */
+ if (taddr != IRTemp_INVALID) {
+ if (texpVal == IRTemp_INVALID) {
+ vassert(restart_point == 0);
+ storeLE( mkexpr(taddr), mkexpr(tres) );
+ } else {
+ vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
+ /* .. and hence 'texpVal' has the same type as 'tres'. */
+ casLE( mkexpr(taddr),
+ mkexpr(texpVal), mkexpr(tres), restart_point );
+ }
+ }
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(thunkOp) ) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
+ mkexpr(oldcn)) )) );
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
+}
+
+
+/* -------------- Helpers for disassembly printing. -------------- */
+
+static HChar* nameGrp1 ( Int opc_aux )
+{
+ static HChar* grp1_names[8]
+ = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
+ if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
+ return grp1_names[opc_aux];
+}
+
+static HChar* nameGrp2 ( Int opc_aux )
+{
+ static HChar* grp2_names[8]
+ = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
+ if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
+ return grp2_names[opc_aux];
+}
+
+static HChar* nameGrp4 ( Int opc_aux )
+{
+ static HChar* grp4_names[8]
+ = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
+ if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
+ return grp4_names[opc_aux];
+}
+
+static HChar* nameGrp5 ( Int opc_aux )
+{
+ static HChar* grp5_names[8]
+ = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
+ if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
+ return grp5_names[opc_aux];
+}
+
+static HChar* nameGrp8 ( Int opc_aux )
+{
+ static HChar* grp8_names[8]
+ = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
+ if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
+ return grp8_names[opc_aux];
+}
+
+//.. static HChar* nameSReg ( UInt sreg )
+//.. {
+//.. switch (sreg) {
+//.. case R_ES: return "%es";
+//.. case R_CS: return "%cs";
+//.. case R_SS: return "%ss";
+//.. case R_DS: return "%ds";
+//.. case R_FS: return "%fs";
+//.. case R_GS: return "%gs";
+//.. default: vpanic("nameSReg(x86)");
+//.. }
+//.. }
+
+static HChar* nameMMXReg ( Int mmxreg )
+{
+ static HChar* mmx_names[8]
+ = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
+ if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
+ return mmx_names[mmxreg];
+}
+
+static HChar* nameXMMReg ( Int xmmreg )
+{
+ static HChar* xmm_names[16]
+ = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
+ if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
+ return xmm_names[xmmreg];
+}
+
+static HChar* nameMMXGran ( Int gran )
+{
+ switch (gran) {
+ case 0: return "b";
+ case 1: return "w";
+ case 2: return "d";
+ case 3: return "q";
+ default: vpanic("nameMMXGran(amd64,guest)");
+ }
+}
+
+static HChar nameISize ( Int size )
+{
+ switch (size) {
+ case 8: return 'q';
+ case 4: return 'l';
+ case 2: return 'w';
+ case 1: return 'b';
+ default: vpanic("nameISize(amd64)");
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- JMP helpers ---*/
+/*------------------------------------------------------------*/
+
+static void jmp_lit( IRJumpKind kind, Addr64 d64 )
+{
+ irsb->next = mkU64(d64);
+ irsb->jumpkind = kind;
+}
+
+static void jmp_treg( IRJumpKind kind, IRTemp t )
+{
+ irsb->next = mkexpr(t);
+ irsb->jumpkind = kind;
+}
+
+static
+void jcc_01 ( AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
+{
+ Bool invert;
+ AMD64Condcode condPos;
+ condPos = positiveIse_AMD64Condcode ( cond, &invert );
+ if (invert) {
+ stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
+ Ijk_Boring,
+ IRConst_U64(d64_false) ) );
+ irsb->next = mkU64(d64_true);
+ irsb->jumpkind = Ijk_Boring;
+ } else {
+ stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
+ Ijk_Boring,
+ IRConst_U64(d64_true) ) );
+ irsb->next = mkU64(d64_false);
+ irsb->jumpkind = Ijk_Boring;
+ }
+}
+
+/* Let new_rsp be the %rsp value after a call/return. Let nia be the
+ guest address of the next instruction to be executed.
+
+ This function generates an AbiHint to say that -128(%rsp)
+ .. -1(%rsp) should now be regarded as uninitialised.
+*/
+static
+void make_redzone_AbiHint ( VexAbiInfo* vbi,
+ IRTemp new_rsp, IRTemp nia, HChar* who )
+{
+ Int szB = vbi->guest_stack_redzone_size;
+ vassert(szB >= 0);
+
+   /* A bit of a kludge. Currently the only ABI we've guested AMD64
+      for is ELF. So just check it's the expected 128 value
+      (paranoia). */
+ vassert(szB == 128);
+
+ if (0) vex_printf("AbiHint: %s\n", who);
+ vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
+ vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
+ if (szB > 0)
+ stmt( IRStmt_AbiHint(
+ binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
+ szB,
+ mkexpr(nia)
+ ));
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Disassembling addressing modes ---*/
+/*------------------------------------------------------------*/
+
+static
+HChar* segRegTxt ( Prefix pfx )
+{
+ if (pfx & PFX_CS) return "%cs:";
+ if (pfx & PFX_DS) return "%ds:";
+ if (pfx & PFX_ES) return "%es:";
+ if (pfx & PFX_FS) return "%fs:";
+ if (pfx & PFX_GS) return "%gs:";
+ if (pfx & PFX_SS) return "%ss:";
+ return ""; /* no override */
+}
+
+
+/* 'virtual' is an IRExpr* holding a virtual address. Convert it to a
+ linear address by adding any required segment override as indicated
+ by sorb, and also dealing with any address size override
+ present. */
+static
+IRExpr* handleAddrOverrides ( VexAbiInfo* vbi,
+ Prefix pfx, IRExpr* virtual )
+{
+ /* --- segment overrides --- */
+ if (pfx & PFX_FS) {
+ if (vbi->guest_amd64_assume_fs_is_zero) {
+ /* Note that this is a linux-kernel specific hack that relies
+ on the assumption that %fs is always zero. */
+ /* return virtual + guest_FS_ZERO. */
+ virtual = binop(Iop_Add64, virtual,
+ IRExpr_Get(OFFB_FS_ZERO, Ity_I64));
+ } else {
+ unimplemented("amd64 %fs segment override");
+ }
+ }
+
+ if (pfx & PFX_GS) {
+ if (vbi->guest_amd64_assume_gs_is_0x60) {
+ /* Note that this is a darwin-kernel specific hack that relies
+ on the assumption that %gs is always 0x60. */
+ /* return virtual + guest_GS_0x60. */
+ virtual = binop(Iop_Add64, virtual,
+ IRExpr_Get(OFFB_GS_0x60, Ity_I64));
+ } else {
+ unimplemented("amd64 %gs segment override");
+ }
+ }
+
+ /* cs, ds, es and ss are simply ignored in 64-bit mode. */
+
+ /* --- address size override --- */
+ if (haveASO(pfx))
+ virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));
+
+ return virtual;
+}
+
+//.. {
+//.. Int sreg;
+//.. IRType hWordTy;
+//.. IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
+//..
+//.. if (sorb == 0)
+//.. /* the common case - no override */
+//.. return virtual;
+//..
+//.. switch (sorb) {
+//.. case 0x3E: sreg = R_DS; break;
+//.. case 0x26: sreg = R_ES; break;
+//.. case 0x64: sreg = R_FS; break;
+//.. case 0x65: sreg = R_GS; break;
+//.. default: vpanic("handleAddrOverrides(x86,guest)");
+//.. }
+//..
+//.. hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
+//..
+//.. seg_selector = newTemp(Ity_I32);
+//.. ldt_ptr = newTemp(hWordTy);
+//.. gdt_ptr = newTemp(hWordTy);
+//.. r64 = newTemp(Ity_I64);
+//..
+//.. assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
+//.. assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
+//.. assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
+//..
+//.. /*
+//.. Call this to do the translation and limit checks:
+//.. ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
+//.. UInt seg_selector, UInt virtual_addr )
+//.. */
+//.. assign(
+//.. r64,
+//.. mkIRExprCCall(
+//.. Ity_I64,
+//.. 0/*regparms*/,
+//.. "x86g_use_seg_selector",
+//.. &x86g_use_seg_selector,
+//.. mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
+//.. mkexpr(seg_selector), virtual)
+//.. )
+//.. );
+//..
+//.. /* If the high 32 of the result are non-zero, there was a
+//.. failure in address translation. In which case, make a
+//.. quick exit.
+//.. */
+//.. stmt(
+//.. IRStmt_Exit(
+//.. binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
+//.. Ijk_MapFail,
+//.. IRConst_U32( guest_eip_curr_instr )
+//.. )
+//.. );
+//..
+//.. /* otherwise, here's the translated result. */
+//.. return unop(Iop_64to32, mkexpr(r64));
+//.. }
+
+
+/* Generate IR to calculate an address indicated by a ModRM and
+ following SIB bytes. The expression, and the number of bytes in
+ the address mode, are returned (the latter in *len). Note that
+ this fn should not be called if the R/M part of the address denotes
+   a register instead of memory.  Text of the addressing mode is
+   placed in buf when front-end tracing (VEX_TRACE_FE) is enabled.
+
+ The computed address is stored in a new tempreg, and the
+ identity of the tempreg is returned.
+
+ extra_bytes holds the number of bytes after the amode, as supplied
+ by the caller. This is needed to make sense of %rip-relative
+ addresses. Note that the value that *len is set to is only the
+ length of the amode itself and does not include the value supplied
+ in extra_bytes.
+ */
+
+static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
+{
+ IRTemp tmp = newTemp(Ity_I64);
+ assign( tmp, addr64 );
+ return tmp;
+}
+
+static
+IRTemp disAMode ( /*OUT*/Int* len,
+ VexAbiInfo* vbi, Prefix pfx, Long delta,
+ /*OUT*/HChar* buf, Int extra_bytes )
+{
+ UChar mod_reg_rm = getUChar(delta);
+ delta++;
+
+ buf[0] = (UChar)0;
+ vassert(extra_bytes >= 0 && extra_bytes < 10);
+
+ /* squeeze out the reg field from mod_reg_rm, since a 256-entry
+ jump table seems a bit excessive.
+ */
+ mod_reg_rm &= 0xC7; /* is now XX000YYY */
+ mod_reg_rm = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
+ /* is now XX0XXYYY */
+ mod_reg_rm &= 0x1F; /* is now 000XXYYY */
+ switch (mod_reg_rm) {
+
+ /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
+ REX.B==1: (%r8) .. (%r15), not including (%r12) or (%r13).
+ */
+ case 0x00: case 0x01: case 0x02: case 0x03:
+ /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
+ { UChar rm = toUChar(mod_reg_rm & 7);
+ DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
+ *len = 1;
+ return disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
+ }
+
+ /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
+ REX.B==1: d8(%r8) ... d8(%r15), not including d8(%r12)
+ */
+ case 0x08: case 0x09: case 0x0A: case 0x0B:
+ /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
+ { UChar rm = toUChar(mod_reg_rm & 7);
+ Long d = getSDisp8(delta);
+ if (d == 0) {
+ DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
+ } else {
+ DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
+ }
+ *len = 2;
+ return disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx,
+ binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
+ }
+
+ /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
+ REX.B==1: d32(%r8) ... d32(%r15), not including d32(%r12)
+ */
+ case 0x10: case 0x11: case 0x12: case 0x13:
+ /* ! 14 */ case 0x15: case 0x16: case 0x17:
+ { UChar rm = toUChar(mod_reg_rm & 7);
+ Long d = getSDisp32(delta);
+ DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
+ *len = 5;
+ return disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx,
+ binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
+ }
+
+ /* REX.B==0: a register, %rax .. %rdi. This shouldn't happen. */
+   /* REX.B==1: a register, %r8 .. %r15. This shouldn't happen. */
+ case 0x18: case 0x19: case 0x1A: case 0x1B:
+ case 0x1C: case 0x1D: case 0x1E: case 0x1F:
+ vpanic("disAMode(amd64): not an addr!");
+
+ /* RIP + disp32. This assumes that guest_RIP_curr_instr is set
+ correctly at the start of handling each instruction. */
+ case 0x05:
+ { Long d = getSDisp32(delta);
+ *len = 5;
+ DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
+ /* We need to know the next instruction's start address.
+ Try and figure out what it is, record the guess, and ask
+ the top-level driver logic (bbToIR_AMD64) to check we
+ guessed right, after the instruction is completely
+ decoded. */
+ guest_RIP_next_mustcheck = True;
+ guest_RIP_next_assumed = guest_RIP_bbstart
+ + delta+4 + extra_bytes;
+ return disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx,
+ binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
+ mkU64(d))));
+ }
+
+ case 0x04: {
+ /* SIB, with no displacement. Special cases:
+ -- %rsp cannot act as an index value.
+ If index_r indicates %rsp, zero is used for the index.
+ -- when mod is zero and base indicates RBP or R13, base is
+ instead a 32-bit sign-extended literal.
+ It's all madness, I tell you. Extract %index, %base and
+ scale from the SIB byte. The value denoted is then:
+ | %index == %RSP && (%base == %RBP || %base == %R13)
+ = d32 following SIB byte
+ | %index == %RSP && !(%base == %RBP || %base == %R13)
+ = %base
+ | %index != %RSP && (%base == %RBP || %base == %R13)
+ = d32 following SIB byte + (%index << scale)
+ | %index != %RSP && !(%base == %RBP || %base == %R13)
+ = %base + (%index << scale)
+ */
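+ /* Illustrative decode: a SIB byte of 0x8D (scale=2, index=001,
+ base=101) with mod=00 hits the third case above: the d32
+ following the SIB byte plus (%rcx << 2), assuming REX.X and
+ REX.B are both zero. */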
+ UChar sib = getUChar(delta);
+ UChar scale = toUChar((sib >> 6) & 3);
+ UChar index_r = toUChar((sib >> 3) & 7);
+ UChar base_r = toUChar(sib & 7);
+ /* correct since #(R13) == 8 + #(RBP) */
+ Bool base_is_BPor13 = toBool(base_r == R_RBP);
+ Bool index_is_SP = toBool(index_r == R_RSP && 0==getRexX(pfx));
+ delta++;
+
+ if ((!index_is_SP) && (!base_is_BPor13)) {
+ if (scale == 0) {
+ DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
+ nameIRegRexB(8,pfx,base_r),
+ nameIReg64rexX(pfx,index_r));
+ } else {
+ DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
+ nameIRegRexB(8,pfx,base_r),
+ nameIReg64rexX(pfx,index_r), 1<<scale);
+ }
+ *len = 2;
+ return
+ disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx,
+ binop(Iop_Add64,
+ getIRegRexB(8,pfx,base_r),
+ binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
+ mkU8(scale)))));
+ }
+
+ if ((!index_is_SP) && base_is_BPor13) {
+ Long d = getSDisp32(delta);
+ DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
+ nameIReg64rexX(pfx,index_r), 1<<scale);
+ *len = 6;
+ return
+ disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx,
+ binop(Iop_Add64,
+ binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
+ mkU8(scale)),
+ mkU64(d))));
+ }
+
+ if (index_is_SP && (!base_is_BPor13)) {
+ DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
+ *len = 2;
+ return disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
+ }
+
+ if (index_is_SP && base_is_BPor13) {
+ Long d = getSDisp32(delta);
+ DIS(buf, "%s%lld", segRegTxt(pfx), d);
+ *len = 6;
+ return disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx, mkU64(d)));
+ }
+
+ vassert(0);
+ }
+
+ /* SIB, with 8-bit displacement. Special cases:
+ -- %rsp cannot act as an index value.
+ If index_r indicates %rsp, zero is used for the index.
+ Denoted value is:
+ | %index == %RSP
+ = d8 + %base
+ | %index != %RSP
+ = d8 + %base + (%index << scale)
+ */
+ case 0x0C: {
+ UChar sib = getUChar(delta);
+ UChar scale = toUChar((sib >> 6) & 3);
+ UChar index_r = toUChar((sib >> 3) & 7);
+ UChar base_r = toUChar(sib & 7);
+ Long d = getSDisp8(delta+1);
+
+ if (index_r == R_RSP && 0==getRexX(pfx)) {
+ DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
+ d, nameIRegRexB(8,pfx,base_r));
+ *len = 3;
+ return disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx,
+ binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
+ } else {
+ if (scale == 0) {
+ DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
+ nameIRegRexB(8,pfx,base_r),
+ nameIReg64rexX(pfx,index_r));
+ } else {
+ DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
+ nameIRegRexB(8,pfx,base_r),
+ nameIReg64rexX(pfx,index_r), 1<<scale);
+ }
+ *len = 3;
+ return
+ disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx,
+ binop(Iop_Add64,
+ binop(Iop_Add64,
+ getIRegRexB(8,pfx,base_r),
+ binop(Iop_Shl64,
+ getIReg64rexX(pfx,index_r), mkU8(scale))),
+ mkU64(d))));
+ }
+ vassert(0); /*NOTREACHED*/
+ }
+
+ /* SIB, with 32-bit displacement. Special cases:
+ -- %rsp cannot act as an index value.
+ If index_r indicates %rsp, zero is used for the index.
+ Denoted value is:
+ | %index == %RSP
+ = d32 + %base
+ | %index != %RSP
+ = d32 + %base + (%index << scale)
+ */
+ case 0x14: {
+ UChar sib = getUChar(delta);
+ UChar scale = toUChar((sib >> 6) & 3);
+ UChar index_r = toUChar((sib >> 3) & 7);
+ UChar base_r = toUChar(sib & 7);
+ Long d = getSDisp32(delta+1);
+
+ if (index_r == R_RSP && 0==getRexX(pfx)) {
+ DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
+ d, nameIRegRexB(8,pfx,base_r));
+ *len = 6;
+ return disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx,
+ binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
+ } else {
+ if (scale == 0) {
+ DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
+ nameIRegRexB(8,pfx,base_r),
+ nameIReg64rexX(pfx,index_r));
+ } else {
+ DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
+ nameIRegRexB(8,pfx,base_r),
+ nameIReg64rexX(pfx,index_r), 1<<scale);
+ }
+ *len = 6;
+ return
+ disAMode_copy2tmp(
+ handleAddrOverrides(vbi, pfx,
+ binop(Iop_Add64,
+ binop(Iop_Add64,
+ getIRegRexB(8,pfx,base_r),
+ binop(Iop_Shl64,
+ getIReg64rexX(pfx,index_r), mkU8(scale))),
+ mkU64(d))));
+ }
+ vassert(0); /*NOTREACHED*/
+ }
+
+ default:
+ vpanic("disAMode(amd64)");
+ return 0; /*notreached*/
+ }
+}
+
+
+/* Figure out the number of (insn-stream) bytes constituting the amode
+ beginning at delta. This is useful for getting hold of literals
+ beyond the end of the amode before it has been disassembled. */
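+/* (e.g. a modRM byte of 0x04 whose SIB byte names %rbp as base
+ decodes to modRM + SIB + d32, hence 6 bytes.) */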
+
+static UInt lengthAMode ( Prefix pfx, Long delta )
+{
+ UChar mod_reg_rm = getUChar(delta);
+ delta++;
+
+ /* squeeze out the reg field from mod_reg_rm, since a 256-entry
+ jump table seems a bit excessive.
+ */
+ mod_reg_rm &= 0xC7; /* is now XX000YYY */
+ mod_reg_rm = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
+ /* is now XX0XXYYY */
+ mod_reg_rm &= 0x1F; /* is now 000XXYYY */
+ switch (mod_reg_rm) {
+
+ /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
+ REX.B==1: (%r8) .. (%r15), not including (%r12) or (%r13).
+ */
+ case 0x00: case 0x01: case 0x02: case 0x03:
+ /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
+ return 1;
+
+ /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
+ REX.B==1: d8(%r8) ... d8(%r15), not including d8(%r12)
+ */
+ case 0x08: case 0x09: case 0x0A: case 0x0B:
+ /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
+ return 2;
+
+ /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
+ REX.B==1: d32(%r8) ... d32(%r15), not including d32(%r12)
+ */
+ case 0x10: case 0x11: case 0x12: case 0x13:
+ /* ! 14 */ case 0x15: case 0x16: case 0x17:
+ return 5;
+
+ /* REX.B==0: a register, %rax .. %rdi. This shouldn't happen. */
+ /* REX.B==1: a register, %r8 .. %r15. This shouldn't happen. */
+ /* Not an address, but still handled. */
+ case 0x18: case 0x19: case 0x1A: case 0x1B:
+ case 0x1C: case 0x1D: case 0x1E: case 0x1F:
+ return 1;
+
+ /* RIP + disp32. */
+ case 0x05:
+ return 5;
+
+ case 0x04: {
+ /* SIB, with no displacement. */
+ UChar sib = getUChar(delta);
+ UChar base_r = toUChar(sib & 7);
+ /* correct since #(R13) == 8 + #(RBP) */
+ Bool base_is_BPor13 = toBool(base_r == R_RBP);
+
+ if (base_is_BPor13) {
+ return 6;
+ } else {
+ return 2;
+ }
+ }
+
+ /* SIB, with 8-bit displacement. */
+ case 0x0C:
+ return 3;
+
+ /* SIB, with 32-bit displacement. */
+ case 0x14:
+ return 6;
+
+ default:
+ vpanic("lengthAMode(amd64)");
+ return 0; /*notreached*/
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Disassembling common idioms ---*/
+/*------------------------------------------------------------*/
+
+/* Handle binary integer instructions of the form
+ op E, G meaning
+ op reg-or-mem, reg
+ Is passed the offset (delta0) of the modRM byte, the actual
+ operation, and the data size. Returns the delta advanced completely
+ over this instruction.
+
+ E(src) is reg-or-mem
+ G(dst) is reg.
+
+ If E is reg, --> GET %G, tmp
+ OP %E, tmp
+ PUT tmp, %G
+
+ If E is mem and OP is not reversible,
+ --> (getAddr E) -> tmpa
+ LD (tmpa), tmpa
+ GET %G, tmp2
+ OP tmpa, tmp2
+ PUT tmp2, %G
+
+ If E is mem and OP is reversible
+ --> (getAddr E) -> tmpa
+ LD (tmpa), tmpa
+ OP %G, tmpa
+ PUT tmpa, %G
+*/
+static
+ULong dis_op2_E_G ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Bool addSubCarry,
+ IROp op8,
+ Bool keep,
+ Int size,
+ Long delta0,
+ HChar* t_amd64opc )
+{
+ HChar dis_buf[50];
+ Int len;
+ IRType ty = szToITy(size);
+ IRTemp dst1 = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dst0 = newTemp(ty);
+ UChar rm = getUChar(delta0);
+ IRTemp addr = IRTemp_INVALID;
+
+ /* addSubCarry == True indicates the intended operation is
+ add-with-carry or subtract-with-borrow. */
+ if (addSubCarry) {
+ vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
+ vassert(keep);
+ }
+
+ if (epartIsReg(rm)) {
+ /* Specially handle XOR reg,reg, because that doesn't really
+ depend on reg, and doing the obvious thing potentially
+ generates a spurious value check failure due to the bogus
+ dependency. */
+ if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
+ && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
+ if (False && op8 == Iop_Sub8)
+ vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
+ putIRegG(size,pfx,rm, mkU(ty,0));
+ }
+
+ assign( dst0, getIRegG(size,pfx,rm) );
+ assign( src, getIRegE(size,pfx,rm) );
+
+ if (addSubCarry && op8 == Iop_Add8) {
+ helper_ADC( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIRegG(size, pfx, rm, mkexpr(dst1));
+ } else
+ if (addSubCarry && op8 == Iop_Sub8) {
+ helper_SBB( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIRegG(size, pfx, rm, mkexpr(dst1));
+ } else {
+ assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ if (keep)
+ putIRegG(size, pfx, rm, mkexpr(dst1));
+ }
+
+ DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
+ nameIRegE(size,pfx,rm),
+ nameIRegG(size,pfx,rm));
+ return 1+delta0;
+ } else {
+ /* E refers to memory */
+ addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
+ assign( dst0, getIRegG(size,pfx,rm) );
+ assign( src, loadLE(szToITy(size), mkexpr(addr)) );
+
+ if (addSubCarry && op8 == Iop_Add8) {
+ helper_ADC( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIRegG(size, pfx, rm, mkexpr(dst1));
+ } else
+ if (addSubCarry && op8 == Iop_Sub8) {
+ helper_SBB( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIRegG(size, pfx, rm, mkexpr(dst1));
+ } else {
+ assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ if (keep)
+ putIRegG(size, pfx, rm, mkexpr(dst1));
+ }
+
+ DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
+ dis_buf, nameIRegG(size, pfx, rm));
+ return len+delta0;
+ }
+}
+
+
+
+/* Handle binary integer instructions of the form
+ op G, E meaning
+ op reg, reg-or-mem
+ Is passed the offset (delta0) of the modRM byte, the actual
+ operation, and the data size. Returns the delta advanced completely
+ over this instruction.
+
+ G(src) is reg.
+ E(dst) is reg-or-mem
+
+ If E is reg, --> GET %E, tmp
+ OP %G, tmp
+ PUT tmp, %E
+
+ If E is mem, --> (getAddr E) -> tmpa
+ LD (tmpa), tmpv
+ OP %G, tmpv
+ ST tmpv, (tmpa)
+*/
+static
+ULong dis_op2_G_E ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Bool addSubCarry,
+ IROp op8,
+ Bool keep,
+ Int size,
+ Long delta0,
+ HChar* t_amd64opc )
+{
+ HChar dis_buf[50];
+ Int len;
+ IRType ty = szToITy(size);
+ IRTemp dst1 = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dst0 = newTemp(ty);
+ UChar rm = getUChar(delta0);
+ IRTemp addr = IRTemp_INVALID;
+
+ /* addSubCarry == True indicates the intended operation is
+ add-with-carry or subtract-with-borrow. */
+ if (addSubCarry) {
+ vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
+ vassert(keep);
+ }
+
+ if (epartIsReg(rm)) {
+ /* Specially handle XOR reg,reg, because that doesn't really
+ depend on reg, and doing the obvious thing potentially
+ generates a spurious value check failure due to the bogus
+ dependency. Ditto SBB reg,reg. */
+ if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
+ && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
+ putIRegE(size,pfx,rm, mkU(ty,0));
+ }
+
+ assign(dst0, getIRegE(size,pfx,rm));
+ assign(src, getIRegG(size,pfx,rm));
+
+ if (addSubCarry && op8 == Iop_Add8) {
+ helper_ADC( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIRegE(size, pfx, rm, mkexpr(dst1));
+ } else
+ if (addSubCarry && op8 == Iop_Sub8) {
+ helper_SBB( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIRegE(size, pfx, rm, mkexpr(dst1));
+ } else {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ if (keep)
+ putIRegE(size, pfx, rm, mkexpr(dst1));
+ }
+
+ DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
+ nameIRegG(size,pfx,rm),
+ nameIRegE(size,pfx,rm));
+ return 1+delta0;
+ }
+
+ /* E refers to memory */
+ {
+ addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
+ assign(dst0, loadLE(ty,mkexpr(addr)));
+ assign(src, getIRegG(size,pfx,rm));
+
+ if (addSubCarry && op8 == Iop_Add8) {
+ if (pfx & PFX_LOCK) {
+ /* cas-style store */
+ helper_ADC( size, dst1, dst0, src,
+ /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
+ } else {
+ /* normal store */
+ helper_ADC( size, dst1, dst0, src,
+ /*store*/addr, IRTemp_INVALID, 0 );
+ }
+ } else
+ if (addSubCarry && op8 == Iop_Sub8) {
+ if (pfx & PFX_LOCK) {
+ /* cas-style store */
+ helper_SBB( size, dst1, dst0, src,
+ /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
+ } else {
+ /* normal store */
+ helper_SBB( size, dst1, dst0, src,
+ /*store*/addr, IRTemp_INVALID, 0 );
+ }
+ } else {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
+ if (keep) {
+ if (pfx & PFX_LOCK) {
+ if (0) vex_printf("locked case\n" );
+ casLE( mkexpr(addr),
+ mkexpr(dst0)/*expval*/,
+ mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
+ } else {
+ if (0) vex_printf("nonlocked case\n");
+ storeLE(mkexpr(addr), mkexpr(dst1));
+ }
+ }
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ }
+
+ DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
+ nameIRegG(size,pfx,rm), dis_buf);
+ return len+delta0;
+ }
+}
+
+
+/* Handle move instructions of the form
+ mov E, G meaning
+ mov reg-or-mem, reg
+ Is passed the offset (delta0) of the modRM byte, and the data size.
+ Returns the delta advanced completely over this instruction.
+
+ E(src) is reg-or-mem
+ G(dst) is reg.
+
+ If E is reg, --> GET %E, tmpv
+ PUT tmpv, %G
+
+ If E is mem --> (getAddr E) -> tmpa
+ LD (tmpa), tmpb
+ PUT tmpb, %G
+*/
+static
+ULong dis_mov_E_G ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Int size,
+ Long delta0 )
+{
+ Int len;
+ UChar rm = getUChar(delta0);
+ HChar dis_buf[50];
+
+ if (epartIsReg(rm)) {
+ putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
+ DIP("mov%c %s,%s\n", nameISize(size),
+ nameIRegE(size,pfx,rm),
+ nameIRegG(size,pfx,rm));
+ return 1+delta0;
+ }
+
+ /* E refers to memory */
+ {
+ IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
+ putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
+ DIP("mov%c %s,%s\n", nameISize(size),
+ dis_buf,
+ nameIRegG(size,pfx,rm));
+ return delta0+len;
+ }
+}
+
+
+/* Handle move instructions of the form
+ mov G, E meaning
+ mov reg, reg-or-mem
+ Is passed the offset (delta0) of the modRM byte, and the data size.
+ Returns the delta advanced completely over this instruction.
+
+ G(src) is reg.
+ E(dst) is reg-or-mem
+
+ If E is reg, --> GET %G, tmp
+ PUT tmp, %E
+
+ If E is mem, --> (getAddr E) -> tmpa
+ GET %G, tmpv
+ ST tmpv, (tmpa)
+*/
+static
+ULong dis_mov_G_E ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Int size,
+ Long delta0 )
+{
+ Int len;
+ UChar rm = getUChar(delta0);
+ HChar dis_buf[50];
+
+ if (epartIsReg(rm)) {
+ putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
+ DIP("mov%c %s,%s\n", nameISize(size),
+ nameIRegG(size,pfx,rm),
+ nameIRegE(size,pfx,rm));
+ return 1+delta0;
+ }
+
+ /* E refers to memory */
+ {
+ IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
+ storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
+ DIP("mov%c %s,%s\n", nameISize(size),
+ nameIRegG(size,pfx,rm),
+ dis_buf);
+ return len+delta0;
+ }
+}
+
+
+/* op $immediate, AL/AX/EAX/RAX. */
+static
+ULong dis_op_imm_A ( Int size,
+ Bool carrying,
+ IROp op8,
+ Bool keep,
+ Long delta,
+ HChar* t_amd64opc )
+{
+ Int size4 = imin(size,4);
+ IRType ty = szToITy(size);
+ IRTemp dst0 = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dst1 = newTemp(ty);
+ Long lit = getSDisp(size4,delta);
+ assign(dst0, getIRegRAX(size));
+ assign(src, mkU(ty,lit & mkSizeMask(size)));
+
+ if (isAddSub(op8) && !carrying) {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ }
+ else
+ if (isLogic(op8)) {
+ vassert(!carrying);
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
+ setFlags_DEP1(op8, dst1, ty);
+ }
+ else
+ if (op8 == Iop_Add8 && carrying) {
+ helper_ADC( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ }
+ else
+ if (op8 == Iop_Sub8 && carrying) {
+ helper_SBB( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ }
+ else
+ vpanic("dis_op_imm_A(amd64,guest)");
+
+ if (keep)
+ putIRegRAX(size, mkexpr(dst1));
+
+ DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
+ lit, nameIRegRAX(size));
+ return delta+size4;
+}
+
+
+/* Sign- and Zero-extending moves. */
+static
+ULong dis_movx_E_G ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Long delta, Int szs, Int szd, Bool sign_extend )
+{
+ UChar rm = getUChar(delta);
+ if (epartIsReg(rm)) {
+ putIRegG(szd, pfx, rm,
+ doScalarWidening(
+ szs,szd,sign_extend,
+ getIRegE(szs,pfx,rm)));
+ DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
+ nameISize(szs),
+ nameISize(szd),
+ nameIRegE(szs,pfx,rm),
+ nameIRegG(szd,pfx,rm));
+ return 1+delta;
+ }
+
+ /* E refers to memory */
+ {
+ Int len;
+ HChar dis_buf[50];
+ IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
+ putIRegG(szd, pfx, rm,
+ doScalarWidening(
+ szs,szd,sign_extend,
+ loadLE(szToITy(szs),mkexpr(addr))));
+ DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
+ nameISize(szs),
+ nameISize(szd),
+ dis_buf,
+ nameIRegG(szd,pfx,rm));
+ return len+delta;
+ }
+}
+
+
+/* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
+ the 64 / 32 / 16 / 8 bit quantity in the given IRTemp. */
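+/* For example, for sz == 4, EDX:EAX is glued into a 64-bit value with
+ Iop_32HLto64; Iop_DivModU64to32 (or its signed variant) then yields
+ the quotient in its low 32 bits and the remainder in its high 32
+ bits, which are written back to EAX and EDX respectively. */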
+static
+void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
+{
+ /* special-case the 64-bit case */
+ if (sz == 8) {
+ IROp op = signed_divide ? Iop_DivModS128to64
+ : Iop_DivModU128to64;
+ IRTemp src128 = newTemp(Ity_I128);
+ IRTemp dst128 = newTemp(Ity_I128);
+ assign( src128, binop(Iop_64HLto128,
+ getIReg64(R_RDX),
+ getIReg64(R_RAX)) );
+ assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
+ putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
+ putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
+ } else {
+ IROp op = signed_divide ? Iop_DivModS64to32
+ : Iop_DivModU64to32;
+ IRTemp src64 = newTemp(Ity_I64);
+ IRTemp dst64 = newTemp(Ity_I64);
+ switch (sz) {
+ case 4:
+ assign( src64,
+ binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
+ assign( dst64,
+ binop(op, mkexpr(src64), mkexpr(t)) );
+ putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
+ putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
+ break;
+ case 2: {
+ IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
+ IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
+ assign( src64, unop(widen3264,
+ binop(Iop_16HLto32,
+ getIRegRDX(2),
+ getIRegRAX(2))) );
+ assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
+ putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
+ putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
+ break;
+ }
+ case 1: {
+ IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
+ IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
+ IROp widen816 = signed_divide ? Iop_8Sto16 : Iop_8Uto16;
+ assign( src64, unop(widen3264,
+ unop(widen1632, getIRegRAX(2))) );
+ assign( dst64,
+ binop(op, mkexpr(src64),
+ unop(widen1632, unop(widen816, mkexpr(t)))) );
+ putIRegRAX( 1, unop(Iop_16to8,
+ unop(Iop_32to16,
+ unop(Iop_64to32,mkexpr(dst64)))) );
+ putIRegAH( unop(Iop_16to8,
+ unop(Iop_32to16,
+ unop(Iop_64HIto32,mkexpr(dst64)))) );
+ break;
+ }
+ default:
+ vpanic("codegen_div(amd64)");
+ }
+ }
+}
+
+static
+ULong dis_Grp1 ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Long delta, UChar modrm,
+ Int am_sz, Int d_sz, Int sz, Long d64 )
+{
+ Int len;
+ HChar dis_buf[50];
+ IRType ty = szToITy(sz);
+ IRTemp dst1 = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dst0 = newTemp(ty);
+ IRTemp addr = IRTemp_INVALID;
+ IROp op8 = Iop_INVALID;
+ ULong mask = mkSizeMask(sz);
+
+ switch (gregLO3ofRM(modrm)) {
+ case 0: op8 = Iop_Add8; break; case 1: op8 = Iop_Or8; break;
+ case 2: break; // ADC
+ case 3: break; // SBB
+ case 4: op8 = Iop_And8; break; case 5: op8 = Iop_Sub8; break;
+ case 6: op8 = Iop_Xor8; break; case 7: op8 = Iop_Sub8; break;
+ /*NOTREACHED*/
+ default: vpanic("dis_Grp1(amd64): unhandled case");
+ }
+
+ if (epartIsReg(modrm)) {
+ vassert(am_sz == 1);
+
+ assign(dst0, getIRegE(sz,pfx,modrm));
+ assign(src, mkU(ty,d64 & mask));
+
+ if (gregLO3ofRM(modrm) == 2 /* ADC */) {
+ helper_ADC( sz, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ } else
+ if (gregLO3ofRM(modrm) == 3 /* SBB */) {
+ helper_SBB( sz, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ } else {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ }
+
+ if (gregLO3ofRM(modrm) < 7)
+ putIRegE(sz, pfx, modrm, mkexpr(dst1));
+
+ delta += (am_sz + d_sz);
+ DIP("%s%c $%lld, %s\n",
+ nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
+ nameIRegE(sz,pfx,modrm));
+ } else {
+ addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
+
+ assign(dst0, loadLE(ty,mkexpr(addr)));
+ assign(src, mkU(ty,d64 & mask));
+
+ if (gregLO3ofRM(modrm) == 2 /* ADC */) {
+ if (pfx & PFX_LOCK) {
+ /* cas-style store */
+ helper_ADC( sz, dst1, dst0, src,
+ /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
+ } else {
+ /* normal store */
+ helper_ADC( sz, dst1, dst0, src,
+ /*store*/addr, IRTemp_INVALID, 0 );
+ }
+ } else
+ if (gregLO3ofRM(modrm) == 3 /* SBB */) {
+ if (pfx & PFX_LOCK) {
+ /* cas-style store */
+ helper_SBB( sz, dst1, dst0, src,
+ /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
+ } else {
+ /* normal store */
+ helper_SBB( sz, dst1, dst0, src,
+ /*store*/addr, IRTemp_INVALID, 0 );
+ }
+ } else {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
+ if (gregLO3ofRM(modrm) < 7) {
+ if (pfx & PFX_LOCK) {
+ casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
+ mkexpr(dst1)/*newVal*/,
+ guest_RIP_curr_instr );
+ } else {
+ storeLE(mkexpr(addr), mkexpr(dst1));
+ }
+ }
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ }
+
+ delta += (len+d_sz);
+ DIP("%s%c $%lld, %s\n",
+ nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
+ d64, dis_buf);
+ }
+ return delta;
+}
+
+
+/* Group 2 extended opcodes. shift_expr must be an 8-bit typed
+ expression. */
+
+static
+ULong dis_Grp2 ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Long delta, UChar modrm,
+ Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
+ HChar* shift_expr_txt, Bool* decode_OK )
+{
+ /* delta on entry points at the modrm byte. */
+ HChar dis_buf[50];
+ Int len;
+ Bool isShift, isRotate, isRotateC;
+ IRType ty = szToITy(sz);
+ IRTemp dst0 = newTemp(ty);
+ IRTemp dst1 = newTemp(ty);
+ IRTemp addr = IRTemp_INVALID;
+
+ *decode_OK = True;
+
+ vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);
+
+ /* Put value to shift/rotate in dst0. */
+ if (epartIsReg(modrm)) {
+ assign(dst0, getIRegE(sz, pfx, modrm));
+ delta += (am_sz + d_sz);
+ } else {
+ addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
+ assign(dst0, loadLE(ty,mkexpr(addr)));
+ delta += len + d_sz;
+ }
+
+ isShift = False;
+ switch (gregLO3ofRM(modrm)) { case 4: case 5: case 7: isShift = True; }
+
+ isRotate = False;
+ switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }
+
+ isRotateC = False;
+ switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }
+
+ if (gregLO3ofRM(modrm) == 6) {
+ *decode_OK = False;
+ return delta;
+ }
+
+ if (!isShift && !isRotate && !isRotateC) {
+ /*NOTREACHED*/
+ vpanic("dis_Grp2(Reg): unhandled case(amd64)");
+ }
+
+ if (isRotateC) {
+ /* Call a helper; this insn is so ridiculous it does not deserve
+ better. One problem is, the helper has to calculate both the
+ new value and the new flags. This is more than 64 bits, and
+ there is no way to return more than 64 bits from the helper.
+ Hence the crude and obvious solution is to call it twice,
+ using the sign of the sz field to indicate whether it is the
+ value or rflags result we want.
+ */
+ Bool left = toBool(gregLO3ofRM(modrm) == 2);
+ IRExpr** argsVALUE;
+ IRExpr** argsRFLAGS;
+
+ IRTemp new_value = newTemp(Ity_I64);
+ IRTemp new_rflags = newTemp(Ity_I64);
+ IRTemp old_rflags = newTemp(Ity_I64);
+
+ assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );
+
+ argsVALUE
+ = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
+ widenUto64(shift_expr), /* rotate amount */
+ mkexpr(old_rflags),
+ mkU64(sz) );
+ assign( new_value,
+ mkIRExprCCall(
+ Ity_I64,
+ 0/*regparm*/,
+ left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
+ left ? &amd64g_calculate_RCL : &amd64g_calculate_RCR,
+ argsVALUE
+ )
+ );
+
+ argsRFLAGS
+ = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
+ widenUto64(shift_expr), /* rotate amount */
+ mkexpr(old_rflags),
+ mkU64(-sz) );
+ assign( new_rflags,
+ mkIRExprCCall(
+ Ity_I64,
+ 0/*regparm*/,
+ left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
+ left ? &amd64g_calculate_RCL : &amd64g_calculate_RCR,
+ argsRFLAGS
+ )
+ );
+
+ assign( dst1, narrowTo(ty, mkexpr(new_value)) );
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+ }
+
+ else
+ if (isShift) {
+
+ IRTemp pre64 = newTemp(Ity_I64);
+ IRTemp res64 = newTemp(Ity_I64);
+ IRTemp res64ss = newTemp(Ity_I64);
+ IRTemp shift_amt = newTemp(Ity_I8);
+ UChar mask = toUChar(sz==8 ? 63 : 31);
+ IROp op64;
+
+ switch (gregLO3ofRM(modrm)) {
+ case 4: op64 = Iop_Shl64; break;
+ case 5: op64 = Iop_Shr64; break;
+ case 7: op64 = Iop_Sar64; break;
+ /*NOTREACHED*/
+ default: vpanic("dis_Grp2:shift"); break;
+ }
+
+ /* Widen the value to be shifted to 64 bits, do the shift, and
+ narrow back down. This seems surprisingly long-winded, but
+ unfortunately the AMD semantics require that 8/16/32-bit
+ shifts give defined results for shift values all the way up
+ to 31, and this seems the simplest way to do it. It has the
+ advantage that the only IR level shifts generated are of 64
+ bit values, and the shift amount is guaranteed to be in the
+ range 0 .. 63, thereby observing the IR semantics requiring
+ all shift values to be in the range 0 .. 2^word_size-1.
+
+ Therefore the shift amount is masked with 63 for 64-bit shifts
+ and 31 for all others.
+ */
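+ /* Worked example (illustrative): an 8-bit SHL with a count of 9
+ keeps the count as 9 after masking with 31, widens the operand
+ to 64 bits, shifts left by 9 and narrows back to 8 bits, giving
+ the architecturally defined result 0. */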
+ /* shift_amt = shift_expr & MASK, regardless of operation size */
+ assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );
+
+ /* suitably widen the value to be shifted to 64 bits. */
+ assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
+ : widenUto64(mkexpr(dst0)) );
+
+ /* res64 = pre64 `shift` shift_amt */
+ assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );
+
+ /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
+ assign( res64ss,
+ binop(op64,
+ mkexpr(pre64),
+ binop(Iop_And8,
+ binop(Iop_Sub8,
+ mkexpr(shift_amt), mkU8(1)),
+ mkU8(mask))) );
+
+ /* Build the flags thunk. */
+ setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);
+
+ /* Narrow the result back down. */
+ assign( dst1, narrowTo(ty, mkexpr(res64)) );
+
+ } /* if (isShift) */
+
+ else
+ if (isRotate) {
+ Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
+ : (ty==Ity_I32 ? 2 : 3));
+ Bool left = toBool(gregLO3ofRM(modrm) == 0);
+ IRTemp rot_amt = newTemp(Ity_I8);
+ IRTemp rot_amt64 = newTemp(Ity_I8);
+ IRTemp oldFlags = newTemp(Ity_I64);
+ UChar mask = toUChar(sz==8 ? 63 : 31);
+
+ /* rot_amt = shift_expr & mask */
+ /* By masking the rotate amount thusly, the IR-level Shl/Shr
+ expressions never shift beyond the word size and thus remain
+ well defined. */
+ assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));
+
+ if (ty == Ity_I64)
+ assign(rot_amt, mkexpr(rot_amt64));
+ else
+ assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));
+
+ if (left) {
+
+ /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
+ assign(dst1,
+ binop( mkSizedOp(ty,Iop_Or8),
+ binop( mkSizedOp(ty,Iop_Shl8),
+ mkexpr(dst0),
+ mkexpr(rot_amt)
+ ),
+ binop( mkSizedOp(ty,Iop_Shr8),
+ mkexpr(dst0),
+ binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
+ )
+ )
+ );
+ ccOp += AMD64G_CC_OP_ROLB;
+
+ } else { /* right */
+
+ /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
+ assign(dst1,
+ binop( mkSizedOp(ty,Iop_Or8),
+ binop( mkSizedOp(ty,Iop_Shr8),
+ mkexpr(dst0),
+ mkexpr(rot_amt)
+ ),
+ binop( mkSizedOp(ty,Iop_Shl8),
+ mkexpr(dst0),
+ binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
+ )
+ )
+ );
+ ccOp += AMD64G_CC_OP_RORB;
+
+ }
+
+ /* dst1 now holds the rotated value. Build flag thunk. We
+ need the resulting value for this, and the previous flags.
+ Except don't set it if the rotate count is zero. */
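+ /* Reminder: IRExpr_Mux0X(c, e0, eX) yields e0 when the I8 value c
+ is zero and eX otherwise, so a zero rotate amount leaves the
+ existing thunk untouched below. */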
+
+ assign(oldFlags, mk_amd64g_calculate_rflags_all());
+
+ /* CC_DEP1 is the rotated value. CC_NDEP is flags before. */
+ stmt( IRStmt_Put( OFFB_CC_OP,
+ IRExpr_Mux0X( mkexpr(rot_amt64),
+ IRExpr_Get(OFFB_CC_OP,Ity_I64),
+ mkU64(ccOp))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ IRExpr_Mux0X( mkexpr(rot_amt64),
+ IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
+ widenUto64(mkexpr(dst1)))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2,
+ IRExpr_Mux0X( mkexpr(rot_amt64),
+ IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
+ mkU64(0))) );
+ stmt( IRStmt_Put( OFFB_CC_NDEP,
+ IRExpr_Mux0X( mkexpr(rot_amt64),
+ IRExpr_Get(OFFB_CC_NDEP,Ity_I64),
+ mkexpr(oldFlags))) );
+ } /* if (isRotate) */
+
+ /* Save result, and finish up. */
+ if (epartIsReg(modrm)) {
+ putIRegE(sz, pfx, modrm, mkexpr(dst1));
+ if (vex_traceflags & VEX_TRACE_FE) {
+ vex_printf("%s%c ",
+ nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
+ if (shift_expr_txt)
+ vex_printf("%s", shift_expr_txt);
+ else
+ ppIRExpr(shift_expr);
+ vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
+ }
+ } else {
+ storeLE(mkexpr(addr), mkexpr(dst1));
+ if (vex_traceflags & VEX_TRACE_FE) {
+ vex_printf("%s%c ",
+ nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
+ if (shift_expr_txt)
+ vex_printf("%s", shift_expr_txt);
+ else
+ ppIRExpr(shift_expr);
+ vex_printf(", %s\n", dis_buf);
+ }
+ }
+ return delta;
+}
+
+
+/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
+static
+ULong dis_Grp8_Imm ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Long delta, UChar modrm,
+ Int am_sz, Int sz, ULong src_val,
+ Bool* decode_OK )
+{
+ /* src_val denotes a d8 (the bit offset), and delta on entry
+ points at the modrm byte. */
+
+ IRType ty = szToITy(sz);
+ IRTemp t2 = newTemp(Ity_I64);
+ IRTemp t2m = newTemp(Ity_I64);
+ IRTemp t_addr = IRTemp_INVALID;
+ HChar dis_buf[50];
+ ULong mask;
+
+ /* we're optimists :-) */
+ *decode_OK = True;
+
+ /* Limit src_val -- the bit offset -- to something within a word.
+ The Intel docs say that literal offsets larger than a word are
+ masked in this way. */
+ switch (sz) {
+ case 2: src_val &= 15; break;
+ case 4: src_val &= 31; break;
+ case 8: src_val &= 63; break;
+ default: *decode_OK = False; return delta;
+ }
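+ /* e.g. "btw $17, %ax" (sz == 2): the offset is masked to
+ 17 & 15 == 1. */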
+
+ /* Invent a mask suitable for the operation. */
+ switch (gregLO3ofRM(modrm)) {
+ case 4: /* BT */ mask = 0; break;
+ case 5: /* BTS */ mask = 1ULL << src_val; break;
+ case 6: /* BTR */ mask = ~(1ULL << src_val); break;
+ case 7: /* BTC */ mask = 1ULL << src_val; break;
+ /* If this needs to be extended, it is probably simplest to
+ make a new function to handle the other cases (0 .. 3). The
+ Intel docs do not, however, indicate any use for 0 .. 3, so
+ we don't expect this to happen. */
+ default: *decode_OK = False; return delta;
+ }
+
+ /* Fetch the value to be tested and modified into t2, which is
+ 64-bits wide regardless of sz. */
+ if (epartIsReg(modrm)) {
+ vassert(am_sz == 1);
+ assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
+ delta += (am_sz + 1);
+ DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
+ nameISize(sz),
+ src_val, nameIRegE(sz,pfx,modrm));
+ } else {
+ Int len;
+ t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
+ delta += (len+1);
+ assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
+ DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
+ nameISize(sz),
+ src_val, dis_buf);
+ }
+
+ /* Compute the new value into t2m, if non-BT. */
+ switch (gregLO3ofRM(modrm)) {
+ case 4: /* BT */
+ break;
+ case 5: /* BTS */
+ assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
+ break;
+ case 6: /* BTR */
+ assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
+ break;
+ case 7: /* BTC */
+ assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
+ break;
+ default:
+ /*NOTREACHED*/ /*the previous switch guards this*/
+ vassert(0);
+ }
+
+ /* Write the result back, if non-BT. */
+ if (gregLO3ofRM(modrm) != 4 /* BT */) {
+ if (epartIsReg(modrm)) {
+ putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
+ } else {
+ if (pfx & PFX_LOCK) {
+ casLE( mkexpr(t_addr),
+ narrowTo(ty, mkexpr(t2))/*expd*/,
+ narrowTo(ty, mkexpr(t2m))/*new*/,
+ guest_RIP_curr_instr );
+ } else {
+ storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
+ }
+ }
+ }
+
+ /* Copy relevant bit from t2 into the carry flag. */
+ /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop(Iop_And64,
+ binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
+ mkU64(1))
+ ));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+
+ return delta;
+}
+
+
+/* Signed/unsigned widening multiply. Generate IR to multiply the
+ value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
+ RDX:RAX/EDX:EAX/DX:AX/AX.
+*/
+static void codegen_mulL_A_D ( Int sz, Bool syned,
+ IRTemp tmp, HChar* tmp_txt )
+{
+ IRType ty = szToITy(sz);
+ IRTemp t1 = newTemp(ty);
+
+ assign( t1, getIRegRAX(sz) );
+
+ switch (ty) {
+ case Ity_I64: {
+ IRTemp res128 = newTemp(Ity_I128);
+ IRTemp resHi = newTemp(Ity_I64);
+ IRTemp resLo = newTemp(Ity_I64);
+ IROp mulOp = syned ? Iop_MullS64 : Iop_MullU64;
+ UInt tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
+ setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
+ assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
+ assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
+ assign( resLo, unop(Iop_128to64,mkexpr(res128)));
+ putIReg64(R_RDX, mkexpr(resHi));
+ putIReg64(R_RAX, mkexpr(resLo));
+ break;
+ }
+ case Ity_I32: {
+ IRTemp res64 = newTemp(Ity_I64);
+ IRTemp resHi = newTemp(Ity_I32);
+ IRTemp resLo = newTemp(Ity_I32);
+ IROp mulOp = syned ? Iop_MullS32 : Iop_MullU32;
+ UInt tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
+ setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
+ assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
+ assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
+ assign( resLo, unop(Iop_64to32,mkexpr(res64)));
+ putIRegRDX(4, mkexpr(resHi));
+ putIRegRAX(4, mkexpr(resLo));
+ break;
+ }
+ case Ity_I16: {
+ IRTemp res32 = newTemp(Ity_I32);
+ IRTemp resHi = newTemp(Ity_I16);
+ IRTemp resLo = newTemp(Ity_I16);
+ IROp mulOp = syned ? Iop_MullS16 : Iop_MullU16;
+ UInt tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
+ setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
+ assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
+ assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
+ assign( resLo, unop(Iop_32to16,mkexpr(res32)));
+ putIRegRDX(2, mkexpr(resHi));
+ putIRegRAX(2, mkexpr(resLo));
+ break;
+ }
+ case Ity_I8: {
+ IRTemp res16 = newTemp(Ity_I16);
+ IRTemp resHi = newTemp(Ity_I8);
+ IRTemp resLo = newTemp(Ity_I8);
+ IROp mulOp = syned ? Iop_MullS8 : Iop_MullU8;
+ UInt tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
+ setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
+ assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
+ assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
+ assign( resLo, unop(Iop_16to8,mkexpr(res16)));
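+ /* For the 8-bit case the whole 16-bit product goes to AX
+ (AL = low half, AH = high half), so resHi/resLo are not
+ actually needed here. */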
+ putIRegRAX(2, mkexpr(res16));
+ break;
+ }
+ default:
+ ppIRType(ty);
+ vpanic("codegen_mulL_A_D(amd64)");
+ }
+ DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
+}
+
+
+/* Group 3 extended opcodes. */
+static
+ULong dis_Grp3 ( VexAbiInfo* vbi,
+ Prefix pfx, Int sz, Long delta, Bool* decode_OK )
+{
+ Long d64;
+ UChar modrm;
+ HChar dis_buf[50];
+ Int len;
+ IRTemp addr;
+ IRType ty = szToITy(sz);
+ IRTemp t1 = newTemp(ty);
+ IRTemp dst1, src, dst0;
+ *decode_OK = True;
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ switch (gregLO3ofRM(modrm)) {
+ case 0: { /* TEST */
+ delta++;
+ d64 = getSDisp(imin(4,sz), delta);
+ delta += imin(4,sz);
+ dst1 = newTemp(ty);
+ assign(dst1, binop(mkSizedOp(ty,Iop_And8),
+ getIRegE(sz,pfx,modrm),
+ mkU(ty, d64 & mkSizeMask(sz))));
+ setFlags_DEP1( Iop_And8, dst1, ty );
+ DIP("test%c $%lld, %s\n",
+ nameISize(sz), d64,
+ nameIRegE(sz, pfx, modrm));
+ break;
+ }
+ case 1:
+ *decode_OK = False;
+ return delta;
+ case 2: /* NOT */
+ delta++;
+ putIRegE(sz, pfx, modrm,
+ unop(mkSizedOp(ty,Iop_Not8),
+ getIRegE(sz, pfx, modrm)));
+ DIP("not%c %s\n", nameISize(sz),
+ nameIRegE(sz, pfx, modrm));
+ break;
+ case 3: /* NEG */
+ delta++;
+ dst0 = newTemp(ty);
+ src = newTemp(ty);
+ dst1 = newTemp(ty);
+ assign(dst0, mkU(ty,0));
+ assign(src, getIRegE(sz, pfx, modrm));
+ assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
+ mkexpr(src)));
+ setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
+ putIRegE(sz, pfx, modrm, mkexpr(dst1));
+ DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
+ break;
+ case 4: /* MUL (unsigned widening) */
+ delta++;
+ src = newTemp(ty);
+ assign(src, getIRegE(sz,pfx,modrm));
+ codegen_mulL_A_D ( sz, False, src,
+ nameIRegE(sz,pfx,modrm) );
+ break;
+ case 5: /* IMUL (signed widening) */
+ delta++;
+ src = newTemp(ty);
+ assign(src, getIRegE(sz,pfx,modrm));
+ codegen_mulL_A_D ( sz, True, src,
+ nameIRegE(sz,pfx,modrm) );
+ break;
+ case 6: /* DIV */
+ delta++;
+ assign( t1, getIRegE(sz, pfx, modrm) );
+ codegen_div ( sz, t1, False );
+ DIP("div%c %s\n", nameISize(sz),
+ nameIRegE(sz, pfx, modrm));
+ break;
+ case 7: /* IDIV */
+ delta++;
+ assign( t1, getIRegE(sz, pfx, modrm) );
+ codegen_div ( sz, t1, True );
+ DIP("idiv%c %s\n", nameISize(sz),
+ nameIRegE(sz, pfx, modrm));
+ break;
+ default:
+ /*NOTREACHED*/
+ vpanic("Grp3(amd64,R)");
+ }
+ } else {
+ addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
+ /* we have to inform disAMode of any immediate
+ bytes used */
+ gregLO3ofRM(modrm)==0/*TEST*/
+ ? imin(4,sz)
+ : 0
+ );
+ t1 = newTemp(ty);
+ delta += len;
+ assign(t1, loadLE(ty,mkexpr(addr)));
+ switch (gregLO3ofRM(modrm)) {
+ case 0: { /* TEST */
+ d64 = getSDisp(imin(4,sz), delta);
+ delta += imin(4,sz);
+ dst1 = newTemp(ty);
+ assign(dst1, binop(mkSizedOp(ty,Iop_And8),
+ mkexpr(t1),
+ mkU(ty, d64 & mkSizeMask(sz))));
+ setFlags_DEP1( Iop_And8, dst1, ty );
+ DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
+ break;
+ }
+ case 1:
+ *decode_OK = False;
+ return delta;
+ case 2: /* NOT */
+ dst1 = newTemp(ty);
+ assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
+ if (pfx & PFX_LOCK) {
+ casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
+ guest_RIP_curr_instr );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(dst1) );
+ }
+ DIP("not%c %s\n", nameISize(sz), dis_buf);
+ break;
+ case 3: /* NEG */
+ dst0 = newTemp(ty);
+ src = newTemp(ty);
+ dst1 = newTemp(ty);
+ assign(dst0, mkU(ty,0));
+ assign(src, mkexpr(t1));
+ assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
+ mkexpr(src)));
+ if (pfx & PFX_LOCK) {
+ casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
+ guest_RIP_curr_instr );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(dst1) );
+ }
+ setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
+ DIP("neg%c %s\n", nameISize(sz), dis_buf);
+ break;
+ case 4: /* MUL (unsigned widening) */
+ codegen_mulL_A_D ( sz, False, t1, dis_buf );
+ break;
+ case 5: /* IMUL */
+ codegen_mulL_A_D ( sz, True, t1, dis_buf );
+ break;
+ case 6: /* DIV */
+ codegen_div ( sz, t1, False );
+ DIP("div%c %s\n", nameISize(sz), dis_buf);
+ break;
+ case 7: /* IDIV */
+ codegen_div ( sz, t1, True );
+ DIP("idiv%c %s\n", nameISize(sz), dis_buf);
+ break;
+ default:
+ /*NOTREACHED*/
+ vpanic("Grp3(amd64,M)");
+ }
+ }
+ return delta;
+}
+
+
+/* Group 4 extended opcodes. */
+static
+ULong dis_Grp4 ( VexAbiInfo* vbi,
+ Prefix pfx, Long delta, Bool* decode_OK )
+{
+ Int alen;
+ UChar modrm;
+ HChar dis_buf[50];
+ IRType ty = Ity_I8;
+ IRTemp t1 = newTemp(ty);
+ IRTemp t2 = newTemp(ty);
+
+ *decode_OK = True;
+
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ assign(t1, getIRegE(1, pfx, modrm));
+ switch (gregLO3ofRM(modrm)) {
+ case 0: /* INC */
+ assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
+ putIRegE(1, pfx, modrm, mkexpr(t2));
+ setFlags_INC_DEC( True, t2, ty );
+ break;
+ case 1: /* DEC */
+ assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
+ putIRegE(1, pfx, modrm, mkexpr(t2));
+ setFlags_INC_DEC( False, t2, ty );
+ break;
+ default:
+ *decode_OK = False;
+ return delta;
+ }
+ delta++;
+ DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
+ nameIRegE(1, pfx, modrm));
+ } else {
+ IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( t1, loadLE(ty, mkexpr(addr)) );
+ switch (gregLO3ofRM(modrm)) {
+ case 0: /* INC */
+ assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
+ if (pfx & PFX_LOCK) {
+ casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
+ guest_RIP_curr_instr );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(t2) );
+ }
+ setFlags_INC_DEC( True, t2, ty );
+ break;
+ case 1: /* DEC */
+ assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
+ if (pfx & PFX_LOCK) {
+ casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
+ guest_RIP_curr_instr );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(t2) );
+ }
+ setFlags_INC_DEC( False, t2, ty );
+ break;
+ default:
+ *decode_OK = False;
+ return delta;
+ }
+ delta += alen;
+ DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
+ }
+ return delta;
+}
+
+
+/* Group 5 extended opcodes. */
+static
+ULong dis_Grp5 ( VexAbiInfo* vbi,
+ Prefix pfx, Int sz, Long delta,
+ DisResult* dres, Bool* decode_OK )
+{
+ Int len;
+ UChar modrm;
+ HChar dis_buf[50];
+ IRTemp addr = IRTemp_INVALID;
+ IRType ty = szToITy(sz);
+ IRTemp t1 = newTemp(ty);
+ IRTemp t2 = IRTemp_INVALID;
+ IRTemp t3 = IRTemp_INVALID;
+ Bool showSz = True;
+
+ *decode_OK = True;
+
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ assign(t1, getIRegE(sz,pfx,modrm));
+ switch (gregLO3ofRM(modrm)) {
+ case 0: /* INC */
+ t2 = newTemp(ty);
+ assign(t2, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(t1), mkU(ty,1)));
+ setFlags_INC_DEC( True, t2, ty );
+ putIRegE(sz,pfx,modrm, mkexpr(t2));
+ break;
+ case 1: /* DEC */
+ t2 = newTemp(ty);
+ assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
+ mkexpr(t1), mkU(ty,1)));
+ setFlags_INC_DEC( False, t2, ty );
+ putIRegE(sz,pfx,modrm, mkexpr(t2));
+ break;
+ case 2: /* call Ev */
+ /* Ignore any sz value and operate as if sz==8. */
+ if (!(sz == 4 || sz == 8)) goto unhandled;
+ sz = 8;
+ t3 = newTemp(Ity_I64);
+ assign(t3, getIRegE(sz,pfx,modrm));
+ t2 = newTemp(Ity_I64);
+ assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
+ putIReg64(R_RSP, mkexpr(t2));
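+ /* push the return address: delta points at the modrm byte,
+ so the reg-form call insn ends 1 byte further on */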
+ storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
+ make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
+ jmp_treg(Ijk_Call,t3);
+ dres->whatNext = Dis_StopHere;
+ showSz = False;
+ break;
+ case 4: /* jmp Ev */
+ /* Ignore any sz value and operate as if sz==8. */
+ if (!(sz == 4 || sz == 8)) goto unhandled;
+ sz = 8;
+ t3 = newTemp(Ity_I64);
+ assign(t3, getIRegE(sz,pfx,modrm));
+ jmp_treg(Ijk_Boring,t3);
+ dres->whatNext = Dis_StopHere;
+ showSz = False;
+ break;
+ default:
+ *decode_OK = False;
+ return delta;
+ }
+ delta++;
+ DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
+ showSz ? nameISize(sz) : ' ',
+ nameIRegE(sz, pfx, modrm));
+ } else {
+ addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
+ if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
+ && gregLO3ofRM(modrm) != 6) {
+ assign(t1, loadLE(ty,mkexpr(addr)));
+ }
+ switch (gregLO3ofRM(modrm)) {
+ case 0: /* INC */
+ t2 = newTemp(ty);
+ assign(t2, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(t1), mkU(ty,1)));
+ if (pfx & PFX_LOCK) {
+ casLE( mkexpr(addr),
+ mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
+ } else {
+ storeLE(mkexpr(addr),mkexpr(t2));
+ }
+ setFlags_INC_DEC( True, t2, ty );
+ break;
+ case 1: /* DEC */
+ t2 = newTemp(ty);
+ assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
+ mkexpr(t1), mkU(ty,1)));
+ if (pfx & PFX_LOCK) {
+ casLE( mkexpr(addr),
+ mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
+ } else {
+ storeLE(mkexpr(addr),mkexpr(t2));
+ }
+ setFlags_INC_DEC( False, t2, ty );
+ break;
+ case 2: /* call Ev */
+ /* Ignore any sz value and operate as if sz==8. */
+ if (!(sz == 4 || sz == 8)) goto unhandled;
+ sz = 8;
+ t3 = newTemp(Ity_I64);
+ assign(t3, loadLE(Ity_I64,mkexpr(addr)));
+ t2 = newTemp(Ity_I64);
+ assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
+ putIReg64(R_RSP, mkexpr(t2));
+ storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
+ make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
+ jmp_treg(Ijk_Call,t3);
+ dres->whatNext = Dis_StopHere;
+ showSz = False;
+ break;
+ case 4: /* JMP Ev */
+ /* Ignore any sz value and operate as if sz==8. */
+ if (!(sz == 4 || sz == 8)) goto unhandled;
+ sz = 8;
+ t3 = newTemp(Ity_I64);
+ assign(t3, loadLE(Ity_I64,mkexpr(addr)));
+ jmp_treg(Ijk_Boring,t3);
+ dres->whatNext = Dis_StopHere;
+ showSz = False;
+ break;
+ case 6: /* PUSH Ev */
+ /* There is no encoding for 32-bit operand size; hence ... */
+ if (sz == 4) sz = 8;
+ if (!(sz == 8 || sz == 2)) goto unhandled;
+ if (sz == 8) {
+ t3 = newTemp(Ity_I64);
+ assign(t3, loadLE(Ity_I64,mkexpr(addr)));
+ t2 = newTemp(Ity_I64);
+ assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
+ putIReg64(R_RSP, mkexpr(t2) );
+ storeLE( mkexpr(t2), mkexpr(t3) );
+ break;
+ } else {
+ goto unhandled; /* awaiting test case */
+ }
+ default:
+ unhandled:
+ *decode_OK = False;
+ return delta;
+ }
+ delta += len;
+ DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
+ showSz ? nameISize(sz) : ' ',
+ dis_buf);
+ }
+ return delta;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Disassembling string ops (including REP prefixes) ---*/
+/*------------------------------------------------------------*/
+
+/* Code shared by all the string ops */
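+/* (The guest DFLAG field holds +1 or -1, so shifting it left by
+ log2(sz) yields the per-iteration increment: e.g. +4 or -4 for
+ the 4-byte string ops.) */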
+static
+void dis_string_op_increment ( Int sz, IRTemp t_inc )
+{
+ UChar logSz;
+ if (sz == 8 || sz == 4 || sz == 2) {
+ logSz = 1;
+ if (sz == 4) logSz = 2;
+ if (sz == 8) logSz = 3;
+ assign( t_inc,
+ binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
+ mkU8(logSz) ) );
+ } else {
+ assign( t_inc,
+ IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
+ }
+}
+
+static
+void dis_string_op( void (*dis_OP)( Int, IRTemp ),
+ Int sz, HChar* name, Prefix pfx )
+{
+ IRTemp t_inc = newTemp(Ity_I64);
+ /* Really we ought to inspect the override prefixes, but we don't.
+ The following assertion catches any resulting silliness. */
+ vassert(pfx == clearSegBits(pfx));
+ dis_string_op_increment(sz, t_inc);
+ dis_OP( sz, t_inc );
+ DIP("%s%c\n", name, nameISize(sz));
+}
+
+static
+void dis_MOVS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp td = newTemp(Ity_I64); /* RDI */
+ IRTemp ts = newTemp(Ity_I64); /* RSI */
+
+ assign( td, getIReg64(R_RDI) );
+ assign( ts, getIReg64(R_RSI) );
+
+ storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
+
+ putIReg64( R_RDI, binop(Iop_Add64, mkexpr(td), mkexpr(t_inc)) );
+ putIReg64( R_RSI, binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc)) );
+}
+
+static
+void dis_LODS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp ts = newTemp(Ity_I64); /* RSI */
+
+ assign( ts, getIReg64(R_RSI) );
+
+ putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
+
+ putIReg64( R_RSI, binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc)) );
+}
+
+static
+void dis_STOS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp ta = newTemp(ty); /* rAX */
+ IRTemp td = newTemp(Ity_I64); /* RDI */
+
+ assign( ta, getIRegRAX(sz) );
+
+ assign( td, getIReg64(R_RDI) );
+
+ storeLE( mkexpr(td), mkexpr(ta) );
+
+ putIReg64( R_RDI, binop(Iop_Add64, mkexpr(td), mkexpr(t_inc)) );
+}
+
+static
+void dis_CMPS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp tdv = newTemp(ty); /* (RDI) */
+ IRTemp tsv = newTemp(ty); /* (RSI) */
+ IRTemp td = newTemp(Ity_I64); /* RDI */
+ IRTemp ts = newTemp(Ity_I64); /* RSI */
+
+ assign( td, getIReg64(R_RDI) );
+
+ assign( ts, getIReg64(R_RSI) );
+
+ assign( tdv, loadLE(ty,mkexpr(td)) );
+
+ assign( tsv, loadLE(ty,mkexpr(ts)) );
+
+ setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
+
+ putIReg64(R_RDI, binop(Iop_Add64, mkexpr(td), mkexpr(t_inc)) );
+
+ putIReg64(R_RSI, binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc)) );
+}
+
+static
+void dis_SCAS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp ta = newTemp(ty); /* rAX */
+ IRTemp td = newTemp(Ity_I64); /* RDI */
+ IRTemp tdv = newTemp(ty); /* (RDI) */
+
+ assign( ta, getIRegRAX(sz) );
+
+ assign( td, getIReg64(R_RDI) );
+
+ assign( tdv, loadLE(ty,mkexpr(td)) );
+
+ setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
+
+ putIReg64(R_RDI, binop(Iop_Add64, mkexpr(td), mkexpr(t_inc)) );
+}
+
+
+/* Wrap the appropriate string op inside a REP/REPE/REPNE. We assume
+ the insn is the last one in the basic block, and so emit a jump to
+ the next insn, rather than just falling through. */
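+/* (For plain REP the caller passes AMD64CondAlways, which gives an
+ unconditional back-edge to rip; REPE/REPNE instead pass a condition
+ that is tested after the string op itself.) */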
+static
+void dis_REP_op ( AMD64Condcode cond,
+ void (*dis_OP)(Int, IRTemp),
+ Int sz, Addr64 rip, Addr64 rip_next, HChar* name,
+ Prefix pfx )
+{
+ IRTemp t_inc = newTemp(Ity_I64);
+ IRTemp tc = newTemp(Ity_I64); /* RCX */
+
+ /* Really we ought to inspect the override prefixes, but we don't.
+ The following assertion catches any resulting silliness. */
+ vassert(pfx == clearSegBits(pfx));
+
+ assign( tc, getIReg64(R_RCX) );
+
+ stmt( IRStmt_Exit( binop(Iop_CmpEQ64,mkexpr(tc),mkU64(0)),
+ Ijk_Boring,
+ IRConst_U64(rip_next) ) );
+
+ putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
+
+ dis_string_op_increment(sz, t_inc);
+ dis_OP (sz, t_inc);
+
+ if (cond == AMD64CondAlways) {
+ jmp_lit(Ijk_Boring,rip);
+ } else {
+ stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
+ Ijk_Boring,
+ IRConst_U64(rip) ) );
+ jmp_lit(Ijk_Boring,rip_next);
+ }
+ DIP("%s%c\n", name, nameISize(sz));
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Arithmetic, etc. ---*/
+/*------------------------------------------------------------*/
+
+/* IMUL E, G. Supplied delta points at the modR/M byte. */
+static
+ULong dis_mul_E_G ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Int size,
+ Long delta0 )
+{
+ Int alen;
+ HChar dis_buf[50];
+ UChar rm = getUChar(delta0);
+ IRType ty = szToITy(size);
+ IRTemp te = newTemp(ty);
+ IRTemp tg = newTemp(ty);
+ IRTemp resLo = newTemp(ty);
+
+ assign( tg, getIRegG(size, pfx, rm) );
+ if (epartIsReg(rm)) {
+ assign( te, getIRegE(size, pfx, rm) );
+ } else {
+ IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
+ assign( te, loadLE(ty,mkexpr(addr)) );
+ }
+
+ setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
+
+ assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
+
+ putIRegG(size, pfx, rm, mkexpr(resLo) );
+
+ if (epartIsReg(rm)) {
+ DIP("imul%c %s, %s\n", nameISize(size),
+ nameIRegE(size,pfx,rm),
+ nameIRegG(size,pfx,rm));
+ return 1+delta0;
+ } else {
+ DIP("imul%c %s, %s\n", nameISize(size),
+ dis_buf,
+ nameIRegG(size,pfx,rm));
+ return alen+delta0;
+ }
+}
+
+
+/* IMUL I * E -> G. Supplied delta points at the modR/M byte. */
+static
+ULong dis_imul_I_E_G ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Int size,
+ Long delta,
+ Int litsize )
+{
+ Long d64;
+ Int alen;
+ HChar dis_buf[50];
+ UChar rm = getUChar(delta);
+ IRType ty = szToITy(size);
+ IRTemp te = newTemp(ty);
+ IRTemp tl = newTemp(ty);
+ IRTemp resLo = newTemp(ty);
+
+ vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);
+
+ if (epartIsReg(rm)) {
+ assign(te, getIRegE(size, pfx, rm));
+ delta++;
+ } else {
+ IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
+ imin(4,litsize) );
+ assign(te, loadLE(ty, mkexpr(addr)));
+ delta += alen;
+ }
+ d64 = getSDisp(imin(4,litsize),delta);
+ delta += imin(4,litsize);
+
+ d64 &= mkSizeMask(size);
+ assign(tl, mkU(ty,d64));
+
+ assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
+
+ setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );
+
+ putIRegG(size, pfx, rm, mkexpr(resLo));
+
+ DIP("imul%c $%lld, %s, %s\n",
+ nameISize(size), d64,
+ ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
+ nameIRegG(size,pfx,rm) );
+ return delta;
+}
+
+
+/* Generate an IR sequence to do a popcount operation on the supplied
+ IRTemp, and return a new IRTemp holding the result. 'ty' may be
+ Ity_I16, Ity_I32 or Ity_I64 only. */
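+/* This is the classic parallel bit-summing scheme. A worked 16-bit
+ example (illustrative): starting from 0xF0F0,
+ pass 1 (mask 0x5555, shift 1) -> 0xA0A0
+ pass 2 (mask 0x3333, shift 2) -> 0x4040
+ pass 3 (mask 0x0F0F, shift 4) -> 0x0404
+ pass 4 (mask 0x00FF, shift 8) -> 0x0008 == popcount(0xF0F0). */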
+static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
+{
+ Int i;
+ if (ty == Ity_I16) {
+ IRTemp old = IRTemp_INVALID;
+ IRTemp nyu = IRTemp_INVALID;
+ IRTemp mask[4];
+ Int shift[4];
+ for (i = 0; i < 4; i++) {
+ mask[i] = newTemp(ty);
+ shift[i] = 1 << i;
+ }
+ assign(mask[0], mkU16(0x5555));
+ assign(mask[1], mkU16(0x3333));
+ assign(mask[2], mkU16(0x0F0F));
+ assign(mask[3], mkU16(0x00FF));
+ old = src;
+ for (i = 0; i < 4; i++) {
+ nyu = newTemp(ty);
+ assign(nyu,
+ binop(Iop_Add16,
+ binop(Iop_And16,
+ mkexpr(old),
+ mkexpr(mask[i])),
+ binop(Iop_And16,
+ binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
+ mkexpr(mask[i]))));
+ old = nyu;
+ }
+ return nyu;
+ }
+ if (ty == Ity_I32) {
+ IRTemp old = IRTemp_INVALID;
+ IRTemp nyu = IRTemp_INVALID;
+ IRTemp mask[5];
+ Int shift[5];
+ for (i = 0; i < 5; i++) {
+ mask[i] = newTemp(ty);
+ shift[i] = 1 << i;
+ }
+ assign(mask[0], mkU32(0x55555555));
+ assign(mask[1], mkU32(0x33333333));
+ assign(mask[2], mkU32(0x0F0F0F0F));
+ assign(mask[3], mkU32(0x00FF00FF));
+ assign(mask[4], mkU32(0x0000FFFF));
+ old = src;
+ for (i = 0; i < 5; i++) {
+ nyu = newTemp(ty);
+ assign(nyu,
+ binop(Iop_Add32,
+ binop(Iop_And32,
+ mkexpr(old),
+ mkexpr(mask[i])),
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
+ mkexpr(mask[i]))));
+ old = nyu;
+ }
+ return nyu;
+ }
+ if (ty == Ity_I64) {
+ IRTemp old = IRTemp_INVALID;
+ IRTemp nyu = IRTemp_INVALID;
+ IRTemp mask[6];
+ Int shift[6];
+ for (i = 0; i < 6; i++) {
+ mask[i] = newTemp(ty);
+ shift[i] = 1 << i;
+ }
+ assign(mask[0], mkU64(0x5555555555555555ULL));
+ assign(mask[1], mkU64(0x3333333333333333ULL));
+ assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
+ assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
+ assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
+ assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
+ old = src;
+ for (i = 0; i < 6; i++) {
+ nyu = newTemp(ty);
+ assign(nyu,
+ binop(Iop_Add64,
+ binop(Iop_And64,
+ mkexpr(old),
+ mkexpr(mask[i])),
+ binop(Iop_And64,
+ binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
+ mkexpr(mask[i]))));
+ old = nyu;
+ }
+ return nyu;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+
+/* Generate an IR sequence to do a count-leading-zeroes operation on
+ the supplied IRTemp, and return a new IRTemp holding the result.
+ 'ty' may be Ity_I16, Ity_I32 or Ity_I64 only. In the case where
+ the argument is zero, return the number of bits in the word (the
+ natural semantics). */
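+/* Worked example (illustrative): for a 16-bit source of 0x0001 the
+ value is shifted left by 48 so that its MSB lands at bit 63;
+ Iop_Clz64 then counts 15 leading zeroes, which is the 16-bit
+ result. A zero source takes the Mux0X path and yields 16. */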
+static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
+{
+ vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
+
+ IRTemp src64 = newTemp(Ity_I64);
+ assign(src64, widenUto64( mkexpr(src) ));
+
+ IRTemp src64x = newTemp(Ity_I64);
+ assign(src64x,
+ binop(Iop_Shl64, mkexpr(src64),
+ mkU8(64 - 8 * sizeofIRType(ty))));
+
+ // Clz64 has undefined semantics when its input is zero, so
+ // special-case around that.
+ IRTemp res64 = newTemp(Ity_I64);
+ assign(res64,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0))),
+ unop(Iop_Clz64, mkexpr(src64x)),
+ mkU64(8 * sizeofIRType(ty))
+ ));
+
+ IRTemp res = newTemp(ty);
+ assign(res, narrowTo(ty, mkexpr(res64)));
+ return res;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- ---*/
+/*--- x87 FLOATING POINT INSTRUCTIONS ---*/
+/*--- ---*/
+/*------------------------------------------------------------*/
+
+/* --- Helper functions for dealing with the register stack. --- */
+
+/* --- Set the emulation-warning pseudo-register. --- */
+
+static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
+ stmt( IRStmt_Put( OFFB_EMWARN, e ) );
+}
+
+/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
+
+static IRExpr* mkQNaN64 ( void )
+{
+ /* QNaN is sign=0, exponent=2047 (all ones), fraction=1 then 51 zeroes
+ == 0b 0 11111111111 1 0(51 times)
+ == 0x7FF8 0000 0000 0000
+ */
+ return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
+}
+
+/* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
+
+static IRExpr* get_ftop ( void )
+{
+ return IRExpr_Get( OFFB_FTOP, Ity_I32 );
+}
+
+static void put_ftop ( IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
+ stmt( IRStmt_Put( OFFB_FTOP, e ) );
+}
+
+/* --------- Get/put the C3210 bits. --------- */
+
+static IRExpr* /* :: Ity_I64 */ get_C3210 ( void )
+{
+ return IRExpr_Get( OFFB_FC3210, Ity_I64 );
+}
+
+static void put_C3210 ( IRExpr* e /* :: Ity_I64 */ )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
+ stmt( IRStmt_Put( OFFB_FC3210, e ) );
+}
+
+/* --------- Get/put the FPU rounding mode. --------- */
+static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
+{
+ return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
+}
+
+static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
+ stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
+}
+
+
+/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
+/* Produces a value in 0 .. 3, which is encoded as per the type
+ IRRoundingMode. Since the guest_FPROUND value is also encoded as
+ per IRRoundingMode, we merely need to get it and mask it for
+ safety.
+*/
+static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
+{
+ return binop( Iop_And32, get_fpround(), mkU32(3) );
+}
+
+static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
+{
+ return mkU32(Irrm_NEAREST);
+}
+
+
+/* --------- Get/set FP register tag bytes. --------- */
+
+/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
+
+static void put_ST_TAG ( Int i, IRExpr* value )
+{
+ IRRegArray* descr;
+ vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
+ descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
+ stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
+}
+
+/* Given i, generate an expression yielding 'ST_TAG(i)'. This will be
+ zero to indicate "Empty" and nonzero to indicate "NonEmpty". */
+
+static IRExpr* get_ST_TAG ( Int i )
+{
+ IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
+ return IRExpr_GetI( descr, get_ftop(), i );
+}
+
+
+/* --------- Get/set FP registers. --------- */
+
+/* Given i, and some expression e, emit 'ST(i) = e' and set the
+ register's tag to indicate the register is full. The previous
+ state of the register is not checked. */
+
+static void put_ST_UNCHECKED ( Int i, IRExpr* value )
+{
+ IRRegArray* descr;
+ vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
+ descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
+ stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
+ /* Mark the register as in-use. */
+ put_ST_TAG(i, mkU8(1));
+}
+
+/* Given i, and some expression e, emit
+ ST(i) = is_full(i) ? NaN : e
+ and set the tag accordingly.
+*/
+
+static void put_ST ( Int i, IRExpr* value )
+{
+ put_ST_UNCHECKED( i,
+ IRExpr_Mux0X( get_ST_TAG(i),
+ /* 0 means empty */
+ value,
+ /* non-0 means full */
+ mkQNaN64()
+ )
+ );
+}
+
+
+/* Given i, generate an expression yielding 'ST(i)'. */
+
+static IRExpr* get_ST_UNCHECKED ( Int i )
+{
+ IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
+ return IRExpr_GetI( descr, get_ftop(), i );
+}
+
+
+/* Given i, generate an expression yielding
+ is_full(i) ? ST(i) : NaN
+*/
+
+static IRExpr* get_ST ( Int i )
+{
+ return
+ IRExpr_Mux0X( get_ST_TAG(i),
+ /* 0 means empty */
+ mkQNaN64(),
+ /* non-0 means full */
+ get_ST_UNCHECKED(i));
+}
+
+
+/* Adjust FTOP downwards by one register. */
+
+static void fp_push ( void )
+{
+ put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
+}
+
+/* Adjust FTOP upwards by one register, and mark the vacated register
+ as empty. */
+
+static void fp_pop ( void )
+{
+ put_ST_TAG(0, mkU8(0));
+ put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
+}
+
+/* Clear the C2 bit of the FPU status register, for
+ sin/cos/tan/sincos. */
+
+static void clear_C2 ( void )
+{
+ put_C3210( binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2)) );
+}
+
+/* Invent a plausible-looking FPU status word value:
+ ((ftop & 7) << 11) | (c3210 & 0x4700)
+ */
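+/* (In the status word, FTOP occupies bits 13..11, while C3,C2,C1,C0
+   live at bits 14,10,9,8 respectively -- hence the 0x4700 mask.) */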
+static IRExpr* get_FPU_sw ( void )
+{
+ return
+ unop(Iop_32to16,
+ binop(Iop_Or32,
+ binop(Iop_Shl32,
+ binop(Iop_And32, get_ftop(), mkU32(7)),
+ mkU8(11)),
+ binop(Iop_And32, unop(Iop_64to32, get_C3210()),
+ mkU32(0x4700))
+ ));
+}
+
+
+/* ------------------------------------------------------- */
+/* Given all that stack-mangling junk, we can now go ahead
+ and describe FP instructions.
+*/
+
+/* ST(0) = ST(0) `op` mem64/32(addr)
+ Need to check ST(0)'s tag on read, but not on write.
+*/
+static
+void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
+ IROp op, Bool dbl )
+{
+ DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
+ if (dbl) {
+ put_ST_UNCHECKED(0,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ loadLE(Ity_F64,mkexpr(addr))
+ ));
+ } else {
+ put_ST_UNCHECKED(0,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
+ ));
+ }
+}
+
+
+/* ST(0) = mem64/32(addr) `op` ST(0)
+ Need to check ST(0)'s tag on read, but not on write.
+*/
+static
+void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
+ IROp op, Bool dbl )
+{
+ DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
+ if (dbl) {
+ put_ST_UNCHECKED(0,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ loadLE(Ity_F64,mkexpr(addr)),
+ get_ST(0)
+ ));
+ } else {
+ put_ST_UNCHECKED(0,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
+ get_ST(0)
+ ));
+ }
+}
+
+
+/* ST(dst) = ST(dst) `op` ST(src).
+ Check dst and src tags when reading but not on write.
+*/
+static
+void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
+ Bool pop_after )
+{
+ DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
+ put_ST_UNCHECKED(
+ st_dst,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(st_dst),
+ get_ST(st_src) )
+ );
+ if (pop_after)
+ fp_pop();
+}
+
+/* ST(dst) = ST(src) `op` ST(dst).
+ Check dst and src tags when reading but not on write.
+*/
+static
+void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
+ Bool pop_after )
+{
+ DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
+ put_ST_UNCHECKED(
+ st_dst,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(st_src),
+ get_ST(st_dst) )
+ );
+ if (pop_after)
+ fp_pop();
+}
+
+/* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
+static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
+{
+ DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
+ /* This is a bit of a hack (and isn't really right). It sets
+ Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
+ documentation implies A and S are unchanged.
+ */
+ /* It's also fishy in that it is used both for COMIP and
+ UCOMIP, and they aren't the same (although similar). */
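+   /* Iop_CmpF64 produces a value in the IRCmpF64Result encoding
+      (UN=0x45, EQ=0x40, LT=0x01, GT=0x00), which lines up exactly
+      with the ZF/PF/CF positions in %rflags (0x40/0x04/0x01).
+      Masking with 0x45 therefore yields the required flag triple
+      directly. */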
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop( Iop_And64,
+ unop( Iop_32Uto64,
+ binop(Iop_CmpF64, get_ST(0), get_ST(i))),
+ mkU64(0x45)
+ )));
+ if (pop_after)
+ fp_pop();
+}
+
+
+/* returns
+ 32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
+*/
+static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
+{
+ IRTemp t32 = newTemp(Ity_I32);
+ assign( t32, e32 );
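+   /* Biasing trick: t32 lies in [-32768 .. 32767] exactly when the
+      unsigned value (t32 + 32768) is below 65536.  The Mux0X picks
+      its first arm (0x8000) when that test fails (condition 0), and
+      the plain truncation otherwise. */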
+ return
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpLT64U,
+ unop(Iop_32Uto64,
+ binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
+ mkU64(65536))),
+ mkU16( 0x8000 ),
+ unop(Iop_32to16, mkexpr(t32)));
+}
+
+
+static
+ULong dis_FPU ( /*OUT*/Bool* decode_ok,
+ VexAbiInfo* vbi, Prefix pfx, Long delta )
+{
+ Int len;
+ UInt r_src, r_dst;
+ HChar dis_buf[50];
+ IRTemp t1, t2;
+
+ /* On entry, delta points at the second byte of the insn (the modrm
+ byte).*/
+ UChar first_opcode = getUChar(delta-1);
+ UChar modrm = getUChar(delta+0);
+
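+   /* All x87 insns live in the one-byte opcode range 0xD8..0xDF.
+      In each case, modrm < 0xC0 denotes a memory-operand form in
+      which bits 5,4,3 of the modrm act as an opcode extension,
+      whereas modrm >= 0xC0 denotes a register form selected by the
+      modrm byte itself. */
+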
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
+
+ if (first_opcode == 0xD8) {
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+
+ switch (gregLO3ofRM(modrm)) {
+
+ case 0: /* FADD single-real */
+ fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
+ break;
+
+ case 1: /* FMUL single-real */
+ fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
+ break;
+
+//.. case 2: /* FCOM single-real */
+//.. DIP("fcoms %s\n", dis_buf);
+//.. /* This forces C1 to zero, which isn't right. */
+//.. put_C3210(
+//.. binop( Iop_And32,
+//.. binop(Iop_Shl32,
+//.. binop(Iop_CmpF64,
+//.. get_ST(0),
+//.. unop(Iop_F32toF64,
+//.. loadLE(Ity_F32,mkexpr(addr)))),
+//.. mkU8(8)),
+//.. mkU32(0x4500)
+//.. ));
+//.. break;
+//..
+//.. case 3: /* FCOMP single-real */
+//.. DIP("fcomps %s\n", dis_buf);
+//.. /* This forces C1 to zero, which isn't right. */
+//.. put_C3210(
+//.. binop( Iop_And32,
+//.. binop(Iop_Shl32,
+//.. binop(Iop_CmpF64,
+//.. get_ST(0),
+//.. unop(Iop_F32toF64,
+//.. loadLE(Ity_F32,mkexpr(addr)))),
+//.. mkU8(8)),
+//.. mkU32(0x4500)
+//.. ));
+//.. fp_pop();
+//.. break;
+
+ case 4: /* FSUB single-real */
+ fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
+ break;
+
+ case 5: /* FSUBR single-real */
+ fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
+ break;
+
+ case 6: /* FDIV single-real */
+ fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
+ break;
+
+ case 7: /* FDIVR single-real */
+ fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
+ vex_printf("first_opcode == 0xD8\n");
+ goto decode_fail;
+ }
+ } else {
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
+ fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
+ break;
+
+ case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
+ fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
+ break;
+
+ /* Dunno if this is right */
+ case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
+ r_dst = (UInt)modrm - 0xD0;
+ DIP("fcom %%st(0),%%st(%d)\n", r_dst);
+ /* This forces C1 to zero, which isn't right. */
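+            /* Iop_CmpF64 yields 0x45 (unordered), 0x40 (EQ), 0x01
+               (LT) or 0x00 (GT); shifted left by 8 and masked with
+               0x4500 this lands in the C3 (bit 14), C2 (bit 10) and
+               C0 (bit 8) positions, as FCOM requires. */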
+ put_C3210(
+ unop(Iop_32Uto64,
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
+ mkU8(8)),
+ mkU32(0x4500)
+ )));
+ break;
+
+ /* Dunno if this is right */
+ case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
+ r_dst = (UInt)modrm - 0xD8;
+ DIP("fcomp %%st(0),%%st(%d)\n", r_dst);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ unop(Iop_32Uto64,
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
+ mkU8(8)),
+ mkU32(0x4500)
+ )));
+ fp_pop();
+ break;
+
+ case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
+ fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
+ break;
+
+ case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
+ fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
+ break;
+
+ case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
+ fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
+ break;
+
+ case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
+ fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xD9) {
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+
+ switch (gregLO3ofRM(modrm)) {
+
+ case 0: /* FLD single-real */
+ DIP("flds %s\n", dis_buf);
+ fp_push();
+ put_ST(0, unop(Iop_F32toF64,
+ loadLE(Ity_F32, mkexpr(addr))));
+ break;
+
+ case 2: /* FST single-real */
+ DIP("fsts %s\n", dis_buf);
+ storeLE(mkexpr(addr),
+ binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
+ break;
+
+ case 3: /* FSTP single-real */
+ DIP("fstps %s\n", dis_buf);
+ storeLE(mkexpr(addr),
+ binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
+ fp_pop();
+ break;
+
+ case 4: { /* FLDENV m28 */
+ /* Uses dirty helper:
+                 VexEmWarn amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord ) */
+ IRTemp ew = newTemp(Ity_I32);
+ IRTemp w64 = newTemp(Ity_I64);
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_FLDENV",
+ &amd64g_dirtyhelper_FLDENV,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+ d->tmp = w64;
+ /* declare we're reading memory */
+ d->mFx = Ifx_Read;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 28;
+
+ /* declare we're writing guest state */
+ d->nFxState = 4;
+
+ d->fxState[0].fx = Ifx_Write;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Write;
+ d->fxState[1].offset = OFFB_FPTAGS;
+ d->fxState[1].size = 8 * sizeof(UChar);
+
+ d->fxState[2].fx = Ifx_Write;
+ d->fxState[2].offset = OFFB_FPROUND;
+ d->fxState[2].size = sizeof(ULong);
+
+ d->fxState[3].fx = Ifx_Write;
+ d->fxState[3].offset = OFFB_FC3210;
+ d->fxState[3].size = sizeof(ULong);
+
+ stmt( IRStmt_Dirty(d) );
+
+ /* ew contains any emulation warning we may need to
+ issue. If needed, side-exit to the next insn,
+ reporting the warning, so that Valgrind's dispatcher
+ sees the warning. */
+ assign(ew, unop(Iop_64to32,mkexpr(w64)) );
+ put_emwarn( mkexpr(ew) );
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
+ Ijk_EmWarn,
+ IRConst_U64( guest_RIP_bbstart+delta )
+ )
+ );
+
+ DIP("fldenv %s\n", dis_buf);
+ break;
+ }
+
+ case 5: {/* FLDCW */
+ /* The only thing we observe in the control word is the
+ rounding mode. Therefore, pass the 16-bit value
+ (x87 native-format control word) to a clean helper,
+ getting back a 64-bit value, the lower half of which
+ is the FPROUND value to store, and the upper half of
+ which is the emulation-warning token which may be
+ generated.
+ */
+            /* ULong amd64g_check_fldcw ( ULong ); */
+ IRTemp t64 = newTemp(Ity_I64);
+ IRTemp ew = newTemp(Ity_I32);
+ DIP("fldcw %s\n", dis_buf);
+ assign( t64, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "amd64g_check_fldcw",
+ &amd64g_check_fldcw,
+ mkIRExprVec_1(
+ unop( Iop_16Uto64,
+ loadLE(Ity_I16, mkexpr(addr)))
+ )
+ )
+ );
+
+ put_fpround( unop(Iop_64to32, mkexpr(t64)) );
+ assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
+ put_emwarn( mkexpr(ew) );
+ /* Finally, if an emulation warning was reported,
+ side-exit to the next insn, reporting the warning,
+ so that Valgrind's dispatcher sees the warning. */
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
+ Ijk_EmWarn,
+ IRConst_U64( guest_RIP_bbstart+delta )
+ )
+ );
+ break;
+ }
+
+ case 6: { /* FNSTENV m28 */
+ /* Uses dirty helper:
+               void amd64g_dirtyhelper_FSTENV ( VexGuestAMD64State*, HWord ) */
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_FSTENV",
+ &amd64g_dirtyhelper_FSTENV,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+ /* declare we're writing memory */
+ d->mFx = Ifx_Write;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 28;
+
+ /* declare we're reading guest state */
+ d->nFxState = 4;
+
+ d->fxState[0].fx = Ifx_Read;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Read;
+ d->fxState[1].offset = OFFB_FPTAGS;
+ d->fxState[1].size = 8 * sizeof(UChar);
+
+ d->fxState[2].fx = Ifx_Read;
+ d->fxState[2].offset = OFFB_FPROUND;
+ d->fxState[2].size = sizeof(ULong);
+
+ d->fxState[3].fx = Ifx_Read;
+ d->fxState[3].offset = OFFB_FC3210;
+ d->fxState[3].size = sizeof(ULong);
+
+ stmt( IRStmt_Dirty(d) );
+
+ DIP("fnstenv %s\n", dis_buf);
+ break;
+ }
+
+ case 7: /* FNSTCW */
+ /* Fake up a native x87 FPU control word. The only
+ thing it depends on is FPROUND[1:0], so call a clean
+ helper to cook it up. */
+ /* ULong amd64g_create_fpucw ( ULong fpround ) */
+ DIP("fnstcw %s\n", dis_buf);
+ storeLE(
+ mkexpr(addr),
+ unop( Iop_64to16,
+ mkIRExprCCall(
+ Ity_I64, 0/*regp*/,
+ "amd64g_create_fpucw", &amd64g_create_fpucw,
+ mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
+ )
+ )
+ );
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
+ vex_printf("first_opcode == 0xD9\n");
+ goto decode_fail;
+ }
+
+ } else {
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FLD %st(?) */
+ r_src = (UInt)modrm - 0xC0;
+ DIP("fld %%st(%u)\n", r_src);
+ t1 = newTemp(Ity_F64);
+ assign(t1, get_ST(r_src));
+ fp_push();
+ put_ST(0, mkexpr(t1));
+ break;
+
+ case 0xC8 ... 0xCF: /* FXCH %st(?) */
+ r_src = (UInt)modrm - 0xC8;
+ DIP("fxch %%st(%u)\n", r_src);
+ t1 = newTemp(Ity_F64);
+ t2 = newTemp(Ity_F64);
+ assign(t1, get_ST(0));
+ assign(t2, get_ST(r_src));
+ put_ST_UNCHECKED(0, mkexpr(t2));
+ put_ST_UNCHECKED(r_src, mkexpr(t1));
+ break;
+
+ case 0xE0: /* FCHS */
+ DIP("fchs\n");
+ put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
+ break;
+
+ case 0xE1: /* FABS */
+ DIP("fabs\n");
+ put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
+ break;
+
+ case 0xE5: { /* FXAM */
+ /* This is an interesting one. It examines %st(0),
+ regardless of whether the tag says it's empty or not.
+ Here, just pass both the tag (in our format) and the
+ value (as a double, actually a ULong) to a helper
+ function. */
+ IRExpr** args
+ = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
+ unop(Iop_ReinterpF64asI64,
+ get_ST_UNCHECKED(0)) );
+ put_C3210(mkIRExprCCall(
+ Ity_I64,
+ 0/*regparm*/,
+ "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
+ args
+ ));
+ DIP("fxam\n");
+ break;
+ }
+
+ case 0xE8: /* FLD1 */
+ DIP("fld1\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
+ break;
+
+ case 0xE9: /* FLDL2T */
+ DIP("fldl2t\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
+ break;
+
+ case 0xEA: /* FLDL2E */
+ DIP("fldl2e\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
+ break;
+
+ case 0xEB: /* FLDPI */
+ DIP("fldpi\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
+ break;
+
+ case 0xEC: /* FLDLG2 */
+ DIP("fldlg2\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
+ break;
+
+ case 0xED: /* FLDLN2 */
+ DIP("fldln2\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
+ break;
+
+ case 0xEE: /* FLDZ */
+ DIP("fldz\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
+ break;
+
+ case 0xF0: /* F2XM1 */
+ DIP("f2xm1\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_2xm1F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ break;
+
+ case 0xF1: /* FYL2X */
+ DIP("fyl2x\n");
+ put_ST_UNCHECKED(1,
+ triop(Iop_Yl2xF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(1),
+ get_ST(0)));
+ fp_pop();
+ break;
+
+ case 0xF2: /* FPTAN */
+ DIP("ftan\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_TanF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ fp_push();
+ put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
+ clear_C2(); /* HACK */
+ break;
+
+ case 0xF3: /* FPATAN */
+ DIP("fpatan\n");
+ put_ST_UNCHECKED(1,
+ triop(Iop_AtanF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(1),
+ get_ST(0)));
+ fp_pop();
+ break;
+
+ case 0xF4: { /* FXTRACT */
+ IRTemp argF = newTemp(Ity_F64);
+ IRTemp sigF = newTemp(Ity_F64);
+ IRTemp expF = newTemp(Ity_F64);
+ IRTemp argI = newTemp(Ity_I64);
+ IRTemp sigI = newTemp(Ity_I64);
+ IRTemp expI = newTemp(Ity_I64);
+ DIP("fxtract\n");
+ assign( argF, get_ST(0) );
+ assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
+ assign( sigI,
+ mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86amd64g_calculate_FXTRACT",
+ &x86amd64g_calculate_FXTRACT,
+ mkIRExprVec_2( mkexpr(argI),
+ mkIRExpr_HWord(0)/*sig*/ ))
+ );
+ assign( expI,
+ mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86amd64g_calculate_FXTRACT",
+ &x86amd64g_calculate_FXTRACT,
+ mkIRExprVec_2( mkexpr(argI),
+ mkIRExpr_HWord(1)/*exp*/ ))
+ );
+ assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
+ assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
+ /* exponent */
+ put_ST_UNCHECKED(0, mkexpr(expF) );
+ fp_push();
+ /* significand */
+ put_ST(0, mkexpr(sigF) );
+ break;
+ }
+
+ case 0xF5: { /* FPREM1 -- IEEE compliant */
+ IRTemp a1 = newTemp(Ity_F64);
+ IRTemp a2 = newTemp(Ity_F64);
+ DIP("fprem1\n");
+ /* Do FPREM1 twice, once to get the remainder, and once
+ to get the C3210 flag values. */
+ assign( a1, get_ST(0) );
+ assign( a2, get_ST(1) );
+ put_ST_UNCHECKED(0,
+ triop(Iop_PRem1F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1),
+ mkexpr(a2)));
+ put_C3210(
+ unop(Iop_32Uto64,
+ triop(Iop_PRem1C3210F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1),
+ mkexpr(a2)) ));
+ break;
+ }
+
+ case 0xF7: /* FINCSTP */
+ DIP("fincstp\n");
+ put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
+ break;
+
+ case 0xF8: { /* FPREM -- not IEEE compliant */
+ IRTemp a1 = newTemp(Ity_F64);
+ IRTemp a2 = newTemp(Ity_F64);
+ DIP("fprem\n");
+ /* Do FPREM twice, once to get the remainder, and once
+ to get the C3210 flag values. */
+ assign( a1, get_ST(0) );
+ assign( a2, get_ST(1) );
+ put_ST_UNCHECKED(0,
+ triop(Iop_PRemF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1),
+ mkexpr(a2)));
+ put_C3210(
+ unop(Iop_32Uto64,
+ triop(Iop_PRemC3210F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1),
+ mkexpr(a2)) ));
+ break;
+ }
+
+ case 0xF9: /* FYL2XP1 */
+ DIP("fyl2xp1\n");
+ put_ST_UNCHECKED(1,
+ triop(Iop_Yl2xp1F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(1),
+ get_ST(0)));
+ fp_pop();
+ break;
+
+ case 0xFA: /* FSQRT */
+ DIP("fsqrt\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_SqrtF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ break;
+
+ case 0xFB: { /* FSINCOS */
+ IRTemp a1 = newTemp(Ity_F64);
+ assign( a1, get_ST(0) );
+ DIP("fsincos\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_SinF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1)));
+ fp_push();
+ put_ST(0,
+ binop(Iop_CosF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1)));
+ clear_C2(); /* HACK */
+ break;
+ }
+
+ case 0xFC: /* FRNDINT */
+ DIP("frndint\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
+ break;
+
+ case 0xFD: /* FSCALE */
+ DIP("fscale\n");
+ put_ST_UNCHECKED(0,
+ triop(Iop_ScaleF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ get_ST(1)));
+ break;
+
+ case 0xFE: /* FSIN */
+ DIP("fsin\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_SinF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ clear_C2(); /* HACK */
+ break;
+
+ case 0xFF: /* FCOS */
+ DIP("fcos\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_CosF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ clear_C2(); /* HACK */
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDA) {
+
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IROp fop;
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ switch (gregLO3ofRM(modrm)) {
+
+ case 0: /* FIADD m32int */ /* ST(0) += m32int */
+ DIP("fiaddl %s\n", dis_buf);
+ fop = Iop_AddF64;
+ goto do_fop_m32;
+
+ case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
+ DIP("fimull %s\n", dis_buf);
+ fop = Iop_MulF64;
+ goto do_fop_m32;
+
+ case 4: /* FISUB m32int */ /* ST(0) -= m32int */
+ DIP("fisubl %s\n", dis_buf);
+ fop = Iop_SubF64;
+ goto do_fop_m32;
+
+ case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
+ DIP("fisubrl %s\n", dis_buf);
+ fop = Iop_SubF64;
+ goto do_foprev_m32;
+
+ case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
+ DIP("fisubl %s\n", dis_buf);
+ fop = Iop_DivF64;
+ goto do_fop_m32;
+
+ case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
+ DIP("fidivrl %s\n", dis_buf);
+ fop = Iop_DivF64;
+ goto do_foprev_m32;
+
+ do_fop_m32:
+ put_ST_UNCHECKED(0,
+ triop(fop,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ unop(Iop_I32StoF64,
+ loadLE(Ity_I32, mkexpr(addr)))));
+ break;
+
+ do_foprev_m32:
+ put_ST_UNCHECKED(0,
+ triop(fop,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ unop(Iop_I32StoF64,
+ loadLE(Ity_I32, mkexpr(addr))),
+ get_ST(0)));
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
+ vex_printf("first_opcode == 0xDA\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xC0;
+ DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(AMD64CondB)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xC8;
+ DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(AMD64CondZ)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xD0;
+ DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(AMD64CondBE)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xD8;
+ DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(AMD64CondP)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xE9: /* FUCOMPP %st(0),%st(1) */
+ DIP("fucompp %%st(0),%%st(1)\n");
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ unop(Iop_32Uto64,
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(1)),
+ mkU8(8)),
+ mkU32(0x4500)
+ )));
+ fp_pop();
+ fp_pop();
+ break;
+
+ default:
+ goto decode_fail;
+ }
+
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDB) {
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+
+ switch (gregLO3ofRM(modrm)) {
+
+ case 0: /* FILD m32int */
+ DIP("fildl %s\n", dis_buf);
+ fp_push();
+ put_ST(0, unop(Iop_I32StoF64,
+ loadLE(Ity_I32, mkexpr(addr))));
+ break;
+
+ case 1: /* FISTTPL m32 (SSE3) */
+ DIP("fisttpl %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
+ fp_pop();
+ break;
+
+ case 2: /* FIST m32 */
+ DIP("fistl %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
+ break;
+
+ case 3: /* FISTP m32 */
+ DIP("fistpl %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
+ fp_pop();
+ break;
+
+ case 5: { /* FLD extended-real */
+ /* Uses dirty helper:
+                 ULong amd64g_dirtyhelper_loadF80le ( ULong )
+ addr holds the address. First, do a dirty call to
+ get hold of the data. */
+ IRTemp val = newTemp(Ity_I64);
+ IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
+
+ IRDirty* d = unsafeIRDirty_1_N (
+ val,
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_loadF80le",
+ &amd64g_dirtyhelper_loadF80le,
+ args
+ );
+ /* declare that we're reading memory */
+ d->mFx = Ifx_Read;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 10;
+
+ /* execute the dirty call, dumping the result in val. */
+ stmt( IRStmt_Dirty(d) );
+ fp_push();
+ put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
+
+ DIP("fldt %s\n", dis_buf);
+ break;
+ }
+
+ case 7: { /* FSTP extended-real */
+ /* Uses dirty helper:
+                 void amd64g_dirtyhelper_storeF80le ( ULong addr, ULong data )
+ */
+ IRExpr** args
+ = mkIRExprVec_2( mkexpr(addr),
+ unop(Iop_ReinterpF64asI64, get_ST(0)) );
+
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_storeF80le",
+ &amd64g_dirtyhelper_storeF80le,
+ args
+ );
+ /* declare we're writing memory */
+ d->mFx = Ifx_Write;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 10;
+
+ /* execute the dirty call. */
+ stmt( IRStmt_Dirty(d) );
+ fp_pop();
+
+ DIP("fstpt\n %s", dis_buf);
+ break;
+ }
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
+ vex_printf("first_opcode == 0xDB\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xC0;
+ DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(AMD64CondNB)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xC8;
+ DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
+ put_ST_UNCHECKED(
+ 0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(AMD64CondNZ)),
+ get_ST(0),
+ get_ST(r_src)
+ )
+ );
+ break;
+
+ case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xD0;
+ DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
+ put_ST_UNCHECKED(
+ 0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(AMD64CondNBE)),
+ get_ST(0),
+ get_ST(r_src)
+ )
+ );
+ break;
+
+ case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xD8;
+ DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
+ put_ST_UNCHECKED(
+ 0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(AMD64CondNP)),
+ get_ST(0),
+ get_ST(r_src)
+ )
+ );
+ break;
+
+ case 0xE2:
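+            /* FNCLEX.  Pending x87 exception state is not modelled,
+               so there is nothing to clear; this is a no-op. */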
+ DIP("fnclex\n");
+ break;
+
+ case 0xE3: {
+ /* Uses dirty helper:
+               void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ) */
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_FINIT",
+ &amd64g_dirtyhelper_FINIT,
+ mkIRExprVec_0()
+ );
+ d->needsBBP = True;
+
+ /* declare we're writing guest state */
+ d->nFxState = 5;
+
+ d->fxState[0].fx = Ifx_Write;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Write;
+ d->fxState[1].offset = OFFB_FPREGS;
+ d->fxState[1].size = 8 * sizeof(ULong);
+
+ d->fxState[2].fx = Ifx_Write;
+ d->fxState[2].offset = OFFB_FPTAGS;
+ d->fxState[2].size = 8 * sizeof(UChar);
+
+ d->fxState[3].fx = Ifx_Write;
+ d->fxState[3].offset = OFFB_FPROUND;
+ d->fxState[3].size = sizeof(ULong);
+
+ d->fxState[4].fx = Ifx_Write;
+ d->fxState[4].offset = OFFB_FC3210;
+ d->fxState[4].size = sizeof(ULong);
+
+ stmt( IRStmt_Dirty(d) );
+
+ DIP("fninit\n");
+ break;
+ }
+
+ case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
+ fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
+ break;
+
+ case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
+ fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDC) {
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+
+ switch (gregLO3ofRM(modrm)) {
+
+ case 0: /* FADD double-real */
+ fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
+ break;
+
+ case 1: /* FMUL double-real */
+ fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
+ break;
+
+//.. case 2: /* FCOM double-real */
+//.. DIP("fcoml %s\n", dis_buf);
+//.. /* This forces C1 to zero, which isn't right. */
+//.. put_C3210(
+//.. binop( Iop_And32,
+//.. binop(Iop_Shl32,
+//.. binop(Iop_CmpF64,
+//.. get_ST(0),
+//.. loadLE(Ity_F64,mkexpr(addr))),
+//.. mkU8(8)),
+//.. mkU32(0x4500)
+//.. ));
+//.. break;
+
+ case 3: /* FCOMP double-real */
+ DIP("fcompl %s\n", dis_buf);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ unop(Iop_32Uto64,
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ loadLE(Ity_F64,mkexpr(addr))),
+ mkU8(8)),
+ mkU32(0x4500)
+ )));
+ fp_pop();
+ break;
+
+ case 4: /* FSUB double-real */
+ fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
+ break;
+
+ case 5: /* FSUBR double-real */
+ fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
+ break;
+
+ case 6: /* FDIV double-real */
+ fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
+ break;
+
+ case 7: /* FDIVR double-real */
+ fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
+ vex_printf("first_opcode == 0xDC\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
+ fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
+ break;
+
+ case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
+ fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
+ break;
+
+ case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
+ fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
+ break;
+
+ case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
+ fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
+ break;
+
+ case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
+ fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
+ break;
+
+ case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
+ fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDD) {
+
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+
+ switch (gregLO3ofRM(modrm)) {
+
+ case 0: /* FLD double-real */
+ DIP("fldl %s\n", dis_buf);
+ fp_push();
+ put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
+ break;
+
+ case 1: /* FISTTPQ m64 (SSE3) */
+ DIP("fistppll %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
+ fp_pop();
+ break;
+
+ case 2: /* FST double-real */
+ DIP("fstl %s\n", dis_buf);
+ storeLE(mkexpr(addr), get_ST(0));
+ break;
+
+ case 3: /* FSTP double-real */
+ DIP("fstpl %s\n", dis_buf);
+ storeLE(mkexpr(addr), get_ST(0));
+ fp_pop();
+ break;
+
+//.. case 4: { /* FRSTOR m108 */
+//.. /* Uses dirty helper:
+//.. VexEmWarn x86g_do_FRSTOR ( VexGuestX86State*, Addr32 ) */
+//.. IRTemp ew = newTemp(Ity_I32);
+//.. IRDirty* d = unsafeIRDirty_0_N (
+//.. 0/*regparms*/,
+//.. "x86g_dirtyhelper_FRSTOR",
+//.. &x86g_dirtyhelper_FRSTOR,
+//.. mkIRExprVec_1( mkexpr(addr) )
+//.. );
+//.. d->needsBBP = True;
+//.. d->tmp = ew;
+//.. /* declare we're reading memory */
+//.. d->mFx = Ifx_Read;
+//.. d->mAddr = mkexpr(addr);
+//.. d->mSize = 108;
+//..
+//.. /* declare we're writing guest state */
+//.. d->nFxState = 5;
+//..
+//.. d->fxState[0].fx = Ifx_Write;
+//.. d->fxState[0].offset = OFFB_FTOP;
+//.. d->fxState[0].size = sizeof(UInt);
+//..
+//.. d->fxState[1].fx = Ifx_Write;
+//.. d->fxState[1].offset = OFFB_FPREGS;
+//.. d->fxState[1].size = 8 * sizeof(ULong);
+//..
+//.. d->fxState[2].fx = Ifx_Write;
+//.. d->fxState[2].offset = OFFB_FPTAGS;
+//.. d->fxState[2].size = 8 * sizeof(UChar);
+//..
+//.. d->fxState[3].fx = Ifx_Write;
+//.. d->fxState[3].offset = OFFB_FPROUND;
+//.. d->fxState[3].size = sizeof(UInt);
+//..
+//.. d->fxState[4].fx = Ifx_Write;
+//.. d->fxState[4].offset = OFFB_FC3210;
+//.. d->fxState[4].size = sizeof(UInt);
+//..
+//.. stmt( IRStmt_Dirty(d) );
+//..
+//.. /* ew contains any emulation warning we may need to
+//.. issue. If needed, side-exit to the next insn,
+//.. reporting the warning, so that Valgrind's dispatcher
+//.. sees the warning. */
+//.. put_emwarn( mkexpr(ew) );
+//.. stmt(
+//.. IRStmt_Exit(
+//.. binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
+//.. Ijk_EmWarn,
+//.. IRConst_U32( ((Addr32)guest_eip_bbstart)+delta)
+//.. )
+//.. );
+//..
+//.. DIP("frstor %s\n", dis_buf);
+//.. break;
+//.. }
+//..
+//.. case 6: { /* FNSAVE m108 */
+//.. /* Uses dirty helper:
+//.. void x86g_do_FSAVE ( VexGuestX86State*, UInt ) */
+//.. IRDirty* d = unsafeIRDirty_0_N (
+//.. 0/*regparms*/,
+//.. "x86g_dirtyhelper_FSAVE",
+//.. &x86g_dirtyhelper_FSAVE,
+//.. mkIRExprVec_1( mkexpr(addr) )
+//.. );
+//.. d->needsBBP = True;
+//.. /* declare we're writing memory */
+//.. d->mFx = Ifx_Write;
+//.. d->mAddr = mkexpr(addr);
+//.. d->mSize = 108;
+//..
+//.. /* declare we're reading guest state */
+//.. d->nFxState = 5;
+//..
+//.. d->fxState[0].fx = Ifx_Read;
+//.. d->fxState[0].offset = OFFB_FTOP;
+//.. d->fxState[0].size = sizeof(UInt);
+//..
+//.. d->fxState[1].fx = Ifx_Read;
+//.. d->fxState[1].offset = OFFB_FPREGS;
+//.. d->fxState[1].size = 8 * sizeof(ULong);
+//..
+//.. d->fxState[2].fx = Ifx_Read;
+//.. d->fxState[2].offset = OFFB_FPTAGS;
+//.. d->fxState[2].size = 8 * sizeof(UChar);
+//..
+//.. d->fxState[3].fx = Ifx_Read;
+//.. d->fxState[3].offset = OFFB_FPROUND;
+//.. d->fxState[3].size = sizeof(UInt);
+//..
+//.. d->fxState[4].fx = Ifx_Read;
+//.. d->fxState[4].offset = OFFB_FC3210;
+//.. d->fxState[4].size = sizeof(UInt);
+//..
+//.. stmt( IRStmt_Dirty(d) );
+//..
+//.. DIP("fnsave %s\n", dis_buf);
+//.. break;
+//.. }
+
+ case 7: { /* FNSTSW m16 */
+ IRExpr* sw = get_FPU_sw();
+ vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
+ storeLE( mkexpr(addr), sw );
+ DIP("fnstsw %s\n", dis_buf);
+ break;
+ }
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
+ vex_printf("first_opcode == 0xDD\n");
+ goto decode_fail;
+ }
+ } else {
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FFREE %st(?) */
+ r_dst = (UInt)modrm - 0xC0;
+ DIP("ffree %%st(%u)\n", r_dst);
+ put_ST_TAG ( r_dst, mkU8(0) );
+ break;
+
+ case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
+ r_dst = (UInt)modrm - 0xD0;
+ DIP("fst %%st(0),%%st(%u)\n", r_dst);
+            /* P4 manual says: "If the destination operand is a
+               non-empty register, the invalid-operation exception
+               is not generated."  Hence put_ST_UNCHECKED. */
+ put_ST_UNCHECKED(r_dst, get_ST(0));
+ break;
+
+ case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
+ r_dst = (UInt)modrm - 0xD8;
+ DIP("fstp %%st(0),%%st(%u)\n", r_dst);
+            /* P4 manual says: "If the destination operand is a
+               non-empty register, the invalid-operation exception
+               is not generated."  Hence put_ST_UNCHECKED. */
+ put_ST_UNCHECKED(r_dst, get_ST(0));
+ fp_pop();
+ break;
+
+ case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
+ r_dst = (UInt)modrm - 0xE0;
+ DIP("fucom %%st(0),%%st(%u)\n", r_dst);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ unop(Iop_32Uto64,
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
+ mkU8(8)),
+ mkU32(0x4500)
+ )));
+ break;
+
+ case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
+ r_dst = (UInt)modrm - 0xE8;
+ DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ unop(Iop_32Uto64,
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
+ mkU8(8)),
+ mkU32(0x4500)
+ )));
+ fp_pop();
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDE) {
+
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IROp fop;
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+
+ switch (gregLO3ofRM(modrm)) {
+
+ case 0: /* FIADD m16int */ /* ST(0) += m16int */
+ DIP("fiaddw %s\n", dis_buf);
+ fop = Iop_AddF64;
+ goto do_fop_m16;
+
+ case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
+ DIP("fimulw %s\n", dis_buf);
+ fop = Iop_MulF64;
+ goto do_fop_m16;
+
+ case 4: /* FISUB m16int */ /* ST(0) -= m16int */
+ DIP("fisubw %s\n", dis_buf);
+ fop = Iop_SubF64;
+ goto do_fop_m16;
+
+ case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
+ DIP("fisubrw %s\n", dis_buf);
+ fop = Iop_SubF64;
+ goto do_foprev_m16;
+
+ case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
+ DIP("fisubw %s\n", dis_buf);
+ fop = Iop_DivF64;
+ goto do_fop_m16;
+
+ case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
+ DIP("fidivrw %s\n", dis_buf);
+ fop = Iop_DivF64;
+ goto do_foprev_m16;
+
+ do_fop_m16:
+ put_ST_UNCHECKED(0,
+ triop(fop,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ unop(Iop_I32StoF64,
+ unop(Iop_16Sto32,
+ loadLE(Ity_I16, mkexpr(addr))))));
+ break;
+
+ do_foprev_m16:
+ put_ST_UNCHECKED(0,
+ triop(fop,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ unop(Iop_I32StoF64,
+ unop(Iop_16Sto32,
+ loadLE(Ity_I16, mkexpr(addr)))),
+ get_ST(0)));
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
+ vex_printf("first_opcode == 0xDE\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
+ fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
+ break;
+
+ case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
+ fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
+ break;
+
+ case 0xD9: /* FCOMPP %st(0),%st(1) */
+ DIP("fcompp %%st(0),%%st(1)\n");
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ unop(Iop_32Uto64,
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(1)),
+ mkU8(8)),
+ mkU32(0x4500)
+ )));
+ fp_pop();
+ fp_pop();
+ break;
+
+ case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
+ fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, True );
+ break;
+
+ case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
+ fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, True );
+ break;
+
+ case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
+ fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
+ break;
+
+ case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
+ fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDF) {
+
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+
+ switch (gregLO3ofRM(modrm)) {
+
+ case 0: /* FILD m16int */
+ DIP("fildw %s\n", dis_buf);
+ fp_push();
+ put_ST(0, unop(Iop_I32StoF64,
+ unop(Iop_16Sto32,
+ loadLE(Ity_I16, mkexpr(addr)))));
+ break;
+
+ case 1: /* FISTTPS m16 (SSE3) */
+ DIP("fisttps %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ x87ishly_qnarrow_32_to_16(
+ binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
+ fp_pop();
+ break;
+
+ case 2: /* FIST m16 */
+ DIP("fists %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ x87ishly_qnarrow_32_to_16(
+ binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
+ break;
+
+ case 3: /* FISTP m16 */
+ DIP("fistps %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ x87ishly_qnarrow_32_to_16(
+ binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
+ fp_pop();
+ break;
+
+ case 5: /* FILD m64 */
+ DIP("fildll %s\n", dis_buf);
+ fp_push();
+ put_ST(0, binop(Iop_I64StoF64,
+ get_roundingmode(),
+ loadLE(Ity_I64, mkexpr(addr))));
+ break;
+
+ case 7: /* FISTP m64 */
+ DIP("fistpll %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
+ fp_pop();
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
+ vex_printf("first_opcode == 0xDF\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0: /* FFREEP %st(0) */
+ DIP("ffreep %%st(%d)\n", 0);
+ put_ST_TAG ( 0, mkU8(0) );
+ fp_pop();
+ break;
+
+ case 0xE0: /* FNSTSW %ax */
+ DIP("fnstsw %%ax\n");
+            /* Dump the FPU status word in %AX.  The value is
+               synthesised exactly as in get_FPU_sw above:
+               ((ftop & 7) << 11) | (c3210 & 0x4700)
+            */
+            putIRegRAX( 2, get_FPU_sw() );
+ break;
+
+ case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
+ fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
+ break;
+
+ case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
+ /* not really right since COMIP != UCOMIP */
+ fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+
+ }
+
+ else
+ goto decode_fail;
+
+ *decode_ok = True;
+ return delta;
+
+ decode_fail:
+ *decode_ok = False;
+ return delta;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- ---*/
+/*--- MMX INSTRUCTIONS ---*/
+/*--- ---*/
+/*------------------------------------------------------------*/
+
+/* Effect of MMX insns on x87 FPU state (table 11-2 of
+ IA32 arch manual, volume 3):
+
+ Read from, or write to MMX register (viz, any insn except EMMS):
+ * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
+ * FP stack pointer set to zero
+
+ EMMS:
+ * All tags set to Invalid (empty) -- FPTAGS[i] := zero
+ * FP stack pointer set to zero
+*/
+
+static void do_MMX_preamble ( void )
+{
+ Int i;
+ IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
+ IRExpr* zero = mkU32(0);
+ IRExpr* tag1 = mkU8(1);
+ put_ftop(zero);
+ for (i = 0; i < 8; i++)
+ stmt( IRStmt_PutI( descr, zero, i, tag1 ) );
+}
+
+static void do_EMMS_preamble ( void )
+{
+ Int i;
+ IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
+ IRExpr* zero = mkU32(0);
+ IRExpr* tag0 = mkU8(0);
+ put_ftop(zero);
+ for (i = 0; i < 8; i++)
+ stmt( IRStmt_PutI( descr, zero, i, tag0 ) );
+}
+
+
+static IRExpr* getMMXReg ( UInt archreg )
+{
+ vassert(archreg < 8);
+ return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
+}
+
+
+static void putMMXReg ( UInt archreg, IRExpr* e )
+{
+ vassert(archreg < 8);
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
+ stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
+}
+
+
+/* Helper for non-shift MMX insns. Note this is incomplete in the
+ sense that it does not first call do_MMX_preamble() -- that is the
+ responsibility of its caller. */
+
+static
+ULong dis_MMXop_regmem_to_reg ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Long delta,
+ UChar opc,
+ HChar* name,
+ Bool show_granularity )
+{
+ HChar dis_buf[50];
+ UChar modrm = getUChar(delta);
+ Bool isReg = epartIsReg(modrm);
+ IRExpr* argL = NULL;
+ IRExpr* argR = NULL;
+ IRExpr* argG = NULL;
+ IRExpr* argE = NULL;
+ IRTemp res = newTemp(Ity_I64);
+
+ Bool invG = False;
+ IROp op = Iop_INVALID;
+ void* hAddr = NULL;
+ HChar* hName = NULL;
+ Bool eLeft = False;
+
+# define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
+
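+   /* Each opcode is handled either by a native IR binop ('op') or,
+      where no suitable IROp exists, by a clean helper ('hAddr' and
+      'hName', set via XXX); exactly one of the two alternatives is
+      chosen by the switch below. */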
+ switch (opc) {
+ /* Original MMX ones */
+ case 0xFC: op = Iop_Add8x8; break;
+ case 0xFD: op = Iop_Add16x4; break;
+ case 0xFE: op = Iop_Add32x2; break;
+
+ case 0xEC: op = Iop_QAdd8Sx8; break;
+ case 0xED: op = Iop_QAdd16Sx4; break;
+
+ case 0xDC: op = Iop_QAdd8Ux8; break;
+ case 0xDD: op = Iop_QAdd16Ux4; break;
+
+ case 0xF8: op = Iop_Sub8x8; break;
+ case 0xF9: op = Iop_Sub16x4; break;
+ case 0xFA: op = Iop_Sub32x2; break;
+
+ case 0xE8: op = Iop_QSub8Sx8; break;
+ case 0xE9: op = Iop_QSub16Sx4; break;
+
+ case 0xD8: op = Iop_QSub8Ux8; break;
+ case 0xD9: op = Iop_QSub16Ux4; break;
+
+ case 0xE5: op = Iop_MulHi16Sx4; break;
+ case 0xD5: op = Iop_Mul16x4; break;
+ case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;
+
+ case 0x74: op = Iop_CmpEQ8x8; break;
+ case 0x75: op = Iop_CmpEQ16x4; break;
+ case 0x76: op = Iop_CmpEQ32x2; break;
+
+ case 0x64: op = Iop_CmpGT8Sx8; break;
+ case 0x65: op = Iop_CmpGT16Sx4; break;
+ case 0x66: op = Iop_CmpGT32Sx2; break;
+
+ case 0x6B: op = Iop_QNarrow32Sx2; eLeft = True; break;
+ case 0x63: op = Iop_QNarrow16Sx4; eLeft = True; break;
+ case 0x67: op = Iop_QNarrow16Ux4; eLeft = True; break;
+
+ case 0x68: op = Iop_InterleaveHI8x8; eLeft = True; break;
+ case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
+ case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
+
+ case 0x60: op = Iop_InterleaveLO8x8; eLeft = True; break;
+ case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
+ case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
+
+ case 0xDB: op = Iop_And64; break;
+ case 0xDF: op = Iop_And64; invG = True; break;
+ case 0xEB: op = Iop_Or64; break;
+ case 0xEF: /* Possibly do better here if argL and argR are the
+ same reg */
+ op = Iop_Xor64; break;
+
+ /* Introduced in SSE1 */
+ case 0xE0: op = Iop_Avg8Ux8; break;
+ case 0xE3: op = Iop_Avg16Ux4; break;
+ case 0xEE: op = Iop_Max16Sx4; break;
+ case 0xDE: op = Iop_Max8Ux8; break;
+ case 0xEA: op = Iop_Min16Sx4; break;
+ case 0xDA: op = Iop_Min8Ux8; break;
+ case 0xE4: op = Iop_MulHi16Ux4; break;
+ case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;
+
+ /* Introduced in SSE2 */
+ case 0xD4: op = Iop_Add64; break;
+ case 0xFB: op = Iop_Sub64; break;
+
+ default:
+ vex_printf("\n0x%x\n", (Int)opc);
+ vpanic("dis_MMXop_regmem_to_reg");
+ }
+
+# undef XXX
+
+ argG = getMMXReg(gregLO3ofRM(modrm));
+ if (invG)
+ argG = unop(Iop_Not64, argG);
+
+ if (isReg) {
+ delta++;
+ argE = getMMXReg(eregLO3ofRM(modrm));
+ } else {
+ Int len;
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ argE = loadLE(Ity_I64, mkexpr(addr));
+ }
+
+ if (eLeft) {
+ argL = argE;
+ argR = argG;
+ } else {
+ argL = argG;
+ argR = argE;
+ }
+
+ if (op != Iop_INVALID) {
+ vassert(hName == NULL);
+ vassert(hAddr == NULL);
+ assign(res, binop(op, argL, argR));
+ } else {
+ vassert(hName != NULL);
+ vassert(hAddr != NULL);
+ assign( res,
+ mkIRExprCCall(
+ Ity_I64,
+ 0/*regparms*/, hName, hAddr,
+ mkIRExprVec_2( argL, argR )
+ )
+ );
+ }
+
+ putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
+
+ DIP("%s%s %s, %s\n",
+ name, show_granularity ? nameMMXGran(opc & 3) : "",
+ ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
+ nameMMXReg(gregLO3ofRM(modrm)) );
+
+ return delta;
+}
+
+
+/* Vector by scalar shift of G by the amount specified at the bottom
+ of E. This is a straight copy of dis_SSE_shiftG_byE. */
+
+static ULong dis_MMX_shiftG_byE ( VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op )
+{
+ HChar dis_buf[50];
+ Int alen, size;
+ IRTemp addr;
+ Bool shl, shr, sar;
+ UChar rm = getUChar(delta);
+ IRTemp g0 = newTemp(Ity_I64);
+ IRTemp g1 = newTemp(Ity_I64);
+ IRTemp amt = newTemp(Ity_I64);
+ IRTemp amt8 = newTemp(Ity_I8);
+
+ if (epartIsReg(rm)) {
+ assign( amt, getMMXReg(eregLO3ofRM(rm)) );
+ DIP("%s %s,%s\n", opname,
+ nameMMXReg(eregLO3ofRM(rm)),
+ nameMMXReg(gregLO3ofRM(rm)) );
+ delta++;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameMMXReg(gregLO3ofRM(rm)) );
+ delta += alen;
+ }
+ assign( g0, getMMXReg(gregLO3ofRM(rm)) );
+ assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
+
+ shl = shr = sar = False;
+ size = 0;
+ switch (op) {
+      case Iop_ShlN16x4: shl = True; size = 16; break;
+ case Iop_ShlN32x2: shl = True; size = 32; break;
+ case Iop_Shl64: shl = True; size = 64; break;
+ case Iop_ShrN16x4: shr = True; size = 16; break;
+ case Iop_ShrN32x2: shr = True; size = 32; break;
+ case Iop_Shr64: shr = True; size = 64; break;
+ case Iop_SarN16x4: sar = True; size = 16; break;
+ case Iop_SarN32x2: sar = True; size = 32; break;
+ default: vassert(0);
+ }
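+   /* Mirror hardware semantics for out-of-range shift amounts:
+      logical shifts (shl/shr) by >= the lane size produce zero,
+      while arithmetic shifts (sar) saturate at size-1, replicating
+      the sign bit.  The amount is only known at run time, so the
+      choice is made with a Mux0X. */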
+
+ if (shl || shr) {
+ assign(
+ g1,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
+ mkU64(0),
+ binop(op, mkexpr(g0), mkexpr(amt8))
+ )
+ );
+ } else
+ if (sar) {
+ assign(
+ g1,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
+ binop(op, mkexpr(g0), mkU8(size-1)),
+ binop(op, mkexpr(g0), mkexpr(amt8))
+ )
+ );
+ } else {
+ vassert(0);
+ }
+
+ putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
+ return delta;
+}
+
+
+/* Vector by scalar shift of E by an immediate byte. This is a
+ straight copy of dis_SSE_shiftE_imm. */
+
+static
+ULong dis_MMX_shiftE_imm ( Long delta, HChar* opname, IROp op )
+{
+ Bool shl, shr, sar;
+ UChar rm = getUChar(delta);
+ IRTemp e0 = newTemp(Ity_I64);
+ IRTemp e1 = newTemp(Ity_I64);
+ UChar amt, size;
+ vassert(epartIsReg(rm));
+ vassert(gregLO3ofRM(rm) == 2
+ || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
+ amt = getUChar(delta+1);
+ delta += 2;
+ DIP("%s $%d,%s\n", opname,
+ (Int)amt,
+ nameMMXReg(eregLO3ofRM(rm)) );
+
+ assign( e0, getMMXReg(eregLO3ofRM(rm)) );
+
+ shl = shr = sar = False;
+ size = 0;
+ switch (op) {
+ case Iop_ShlN16x4: shl = True; size = 16; break;
+ case Iop_ShlN32x2: shl = True; size = 32; break;
+ case Iop_Shl64: shl = True; size = 64; break;
+ case Iop_SarN16x4: sar = True; size = 16; break;
+ case Iop_SarN32x2: sar = True; size = 32; break;
+ case Iop_ShrN16x4: shr = True; size = 16; break;
+ case Iop_ShrN32x2: shr = True; size = 32; break;
+ case Iop_Shr64: shr = True; size = 64; break;
+ default: vassert(0);
+ }
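+   /* Same out-of-range semantics as in dis_MMX_shiftG_byE, but here
+      the amount is an immediate known at translation time, so the
+      selection is resolved now rather than with a Mux0X. */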
+
+ if (shl || shr) {
+ assign( e1, amt >= size
+ ? mkU64(0)
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else
+ if (sar) {
+ assign( e1, amt >= size
+ ? binop(op, mkexpr(e0), mkU8(size-1))
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else {
+ vassert(0);
+ }
+
+ putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
+ return delta;
+}
+
+
+/* Completely handle all MMX instructions except emms. */
+
+static
+ULong dis_MMX ( Bool* decode_ok,
+ VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
+{
+ Int len;
+ UChar modrm;
+ HChar dis_buf[50];
+ UChar opc = getUChar(delta);
+ delta++;
+
+ /* dis_MMX handles all insns except emms. */
+ do_MMX_preamble();
+
+ switch (opc) {
+
+ case 0x6E:
+ if (sz == 4) {
+ /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ binop( Iop_32HLto64,
+ mkU32(0),
+ getIReg32(eregOfRexRM(pfx,modrm)) ) );
+ DIP("movd %s, %s\n",
+ nameIReg32(eregOfRexRM(pfx,modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ binop( Iop_32HLto64,
+ mkU32(0),
+ loadLE(Ity_I32, mkexpr(addr)) ) );
+ DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
+ }
+ }
+ else
+ if (sz == 8) {
+ /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putMMXReg( gregLO3ofRM(modrm),
+ getIReg64(eregOfRexRM(pfx,modrm)) );
+ DIP("movd %s, %s\n",
+ nameIReg64(eregOfRexRM(pfx,modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ putMMXReg( gregLO3ofRM(modrm),
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
+ }
+ }
+ else {
+ goto mmx_decode_failure;
+ }
+ break;
+
+ case 0x7E:
+ if (sz == 4) {
+ /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putIReg32( eregOfRexRM(pfx,modrm),
+ unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
+ DIP("movd %s, %s\n",
+ nameMMXReg(gregLO3ofRM(modrm)),
+ nameIReg32(eregOfRexRM(pfx,modrm)));
+ } else {
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ storeLE( mkexpr(addr),
+ unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
+ DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
+ }
+ }
+ else
+ if (sz == 8) {
+ /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putIReg64( eregOfRexRM(pfx,modrm),
+ getMMXReg(gregLO3ofRM(modrm)) );
+ DIP("movd %s, %s\n",
+ nameMMXReg(gregLO3ofRM(modrm)),
+ nameIReg64(eregOfRexRM(pfx,modrm)));
+ } else {
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ storeLE( mkexpr(addr),
+ getMMXReg(gregLO3ofRM(modrm)) );
+ DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
+ }
+ } else {
+ goto mmx_decode_failure;
+ }
+ break;
+
+ case 0x6F:
+ /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4
+ && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
+ goto mmx_decode_failure;
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
+ DIP("movq %s, %s\n",
+ nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movq %s, %s\n",
+ dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
+ }
+ break;
+
+ case 0x7F:
+ /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
+ if (sz != 4
+ && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
+ goto mmx_decode_failure;
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+         /* The assembler doesn't appear to generate the reg-reg
+            form of this, so treat it as undecodable. */
+ goto mmx_decode_failure;
+ } else {
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
+ DIP("mov(nt)q %s, %s\n",
+ nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
+ }
+ break;
+
+ case 0xFC:
+ case 0xFD:
+ case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
+ break;
+
+ case 0xEC:
+ case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4
+ && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
+ break;
+
+ case 0xDC:
+ case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
+ break;
+
+ case 0xF8:
+ case 0xF9:
+ case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
+ break;
+
+ case 0xE8:
+ case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
+ break;
+
+ case 0xD8:
+ case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
+ break;
+
+ case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
+ break;
+
+ case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
+ break;
+
+ case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
+ vassert(sz == 4);
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
+ break;
+
+ case 0x74:
+ case 0x75:
+ case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
+ break;
+
+ case 0x64:
+ case 0x65:
+ case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
+ break;
+
+ case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
+ break;
+
+ case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
+ break;
+
+ case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
+ break;
+
+ case 0x68:
+ case 0x69:
+ case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4
+ && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
+ break;
+
+ case 0x60:
+ case 0x61:
+ case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4
+ && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
+ break;
+
+ case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
+ break;
+
+ case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
+ break;
+
+ case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
+ break;
+
+ case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
+ break;
+
+# define SHIFT_BY_REG(_name,_op) \
+ delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
+ break;
+
+ /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
+ case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
+ case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
+
+ /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
+ case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
+ case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
+
+ /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
+ case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
+
+# undef SHIFT_BY_REG
+
+ case 0x71:
+ case 0x72:
+ case 0x73: {
+ /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
+ UChar byte2, subopc;
+ if (sz != 4)
+ goto mmx_decode_failure;
+ byte2 = getUChar(delta); /* amode / sub-opcode */
+ subopc = toUChar( (byte2 >> 3) & 7 );
+
+# define SHIFT_BY_IMM(_name,_op) \
+ do { delta = dis_MMX_shiftE_imm(delta,_name,_op); \
+ } while (0)
+
+ if (subopc == 2 /*SRL*/ && opc == 0x71)
+ SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
+ else if (subopc == 2 /*SRL*/ && opc == 0x72)
+ SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
+ else if (subopc == 2 /*SRL*/ && opc == 0x73)
+ SHIFT_BY_IMM("psrlq", Iop_Shr64);
+
+ else if (subopc == 4 /*SAR*/ && opc == 0x71)
+ SHIFT_BY_IMM("psraw", Iop_SarN16x4);
+ else if (subopc == 4 /*SAR*/ && opc == 0x72)
+ SHIFT_BY_IMM("psrad", Iop_SarN32x2);
+
+ else if (subopc == 6 /*SHL*/ && opc == 0x71)
+ SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
+ else if (subopc == 6 /*SHL*/ && opc == 0x72)
+ SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
+ else if (subopc == 6 /*SHL*/ && opc == 0x73)
+ SHIFT_BY_IMM("psllq", Iop_Shl64);
+
+ else goto mmx_decode_failure;
+
+# undef SHIFT_BY_IMM
+ break;
+ }
+
+ case 0xF7: {
+ IRTemp addr = newTemp(Ity_I64);
+ IRTemp regD = newTemp(Ity_I64);
+ IRTemp regM = newTemp(Ity_I64);
+ IRTemp mask = newTemp(Ity_I64);
+ IRTemp olddata = newTemp(Ity_I64);
+ IRTemp newdata = newTemp(Ity_I64);
+
+ modrm = getUChar(delta);
+ if (sz != 4 || (!epartIsReg(modrm)))
+ goto mmx_decode_failure;
+ delta++;
+
+ assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
+ assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
+ assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
+ assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
+ assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
+ assign( newdata,
+ binop(Iop_Or64,
+ binop(Iop_And64,
+ mkexpr(regD),
+ mkexpr(mask) ),
+ binop(Iop_And64,
+ mkexpr(olddata),
+ unop(Iop_Not64, mkexpr(mask)))) );
+ storeLE( mkexpr(addr), mkexpr(newdata) );
+ DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
+ nameMMXReg( gregLO3ofRM(modrm) ) );
+ break;
+ }
+
+ /* --- MMX decode failure --- */
+ default:
+ mmx_decode_failure:
+ *decode_ok = False;
+ return delta; /* ignored */
+
+ }
+
+ *decode_ok = True;
+ return delta;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- More misc arithmetic and other obscure insns. ---*/
+/*------------------------------------------------------------*/
+
+/* Generate base << amt with vacated places filled with stuff
+ from xtra. amt guaranteed in 0 .. 63. */
+static
+IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
+{
+ /* if amt == 0
+ then base
+ else (base << amt) | (xtra >>u (64-amt))
+ */
+ return
+ IRExpr_Mux0X(
+ mkexpr(amt),
+ mkexpr(base),
+ binop(Iop_Or64,
+ binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
+ binop(Iop_Shr64, mkexpr(xtra),
+ binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
+ )
+ );
+}
+
+/* Generate base >>u amt with vacated places filled with stuff
+ from xtra. amt guaranteed in 0 .. 63. */
+static
+IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
+{
+ /* if amt == 0
+ then base
+ else (base >>u amt) | (xtra << (64-amt))
+ */
+ return
+ IRExpr_Mux0X(
+ mkexpr(amt),
+ mkexpr(base),
+ binop(Iop_Or64,
+ binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
+ binop(Iop_Shl64, mkexpr(xtra),
+ binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
+ )
+ );
+}
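+
+/* Illustrative scalar model of the two helpers above (a sketch for
+   documentation only; the name is hypothetical and the decoder does
+   not call it).  It also shows why the amt == 0 case is treated
+   specially: otherwise the right-shift amount would be 64, which is
+   undefined in C and in IR alike. */
+static ULong ref_shiftL64_with_extras ( ULong base, ULong xtra, UInt amt )
+{
+   /* amt guaranteed in 0 .. 63 */
+   if (amt == 0) return base;
+   return (base << amt) | (xtra >> (64 - amt));
+}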
+
+/* Double length left and right shifts. Apparently only required in
+ v-size (no b- variant). */
+static
+ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Long delta, UChar modrm,
+ Int sz,
+ IRExpr* shift_amt,
+ Bool amt_is_literal,
+ HChar* shift_amt_txt,
+ Bool left_shift )
+{
+   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
+      for printing it.  On entry, delta points at the modrm byte. */
+ Int len;
+ HChar dis_buf[50];
+
+ IRType ty = szToITy(sz);
+ IRTemp gsrc = newTemp(ty);
+ IRTemp esrc = newTemp(ty);
+ IRTemp addr = IRTemp_INVALID;
+ IRTemp tmpSH = newTemp(Ity_I8);
+ IRTemp tmpSS = newTemp(Ity_I8);
+ IRTemp tmp64 = IRTemp_INVALID;
+ IRTemp res64 = IRTemp_INVALID;
+ IRTemp rss64 = IRTemp_INVALID;
+ IRTemp resTy = IRTemp_INVALID;
+ IRTemp rssTy = IRTemp_INVALID;
+ Int mask = sz==8 ? 63 : 31;
+
+ vassert(sz == 2 || sz == 4 || sz == 8);
+
+ /* The E-part is the destination; this is shifted. The G-part
+ supplies bits to be shifted into the E-part, but is not
+ changed.
+
+ If shifting left, form a double-length word with E at the top
+ and G at the bottom, and shift this left. The result is then in
+ the high part.
+
+ If shifting right, form a double-length word with G at the top
+ and E at the bottom, and shift this right. The result is then
+ at the bottom. */
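+
+   /* Worked example (illustrative): for shldl $4, %ebx, %eax with
+      %eax = 0xAAAAAAAA (E) and %ebx = 0xBBBBBBBB (G), the
+      double-length word is 0xAAAAAAAABBBBBBBB; shifting it left by 4
+      gives 0xAAAAAAABBBBBBBB0, and the new E is the high half,
+      0xAAAAAAAB. */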
+
+ /* Fetch the operands. */
+
+ assign( gsrc, getIRegG(sz, pfx, modrm) );
+
+ if (epartIsReg(modrm)) {
+ delta++;
+ assign( esrc, getIRegE(sz, pfx, modrm) );
+ DIP("sh%cd%c %s, %s, %s\n",
+ ( left_shift ? 'l' : 'r' ), nameISize(sz),
+ shift_amt_txt,
+ nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
+ } else {
+ addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
+ /* # bytes following amode */
+ amt_is_literal ? 1 : 0 );
+ delta += len;
+ assign( esrc, loadLE(ty, mkexpr(addr)) );
+ DIP("sh%cd%c %s, %s, %s\n",
+ ( left_shift ? 'l' : 'r' ), nameISize(sz),
+ shift_amt_txt,
+ nameIRegG(sz, pfx, modrm), dis_buf);
+ }
+
+ /* Calculate the masked shift amount (tmpSH), the masked subshift
+ amount (tmpSS), the shifted value (res64) and the subshifted
+ value (rss64). */
+
+ assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
+ assign( tmpSS, binop(Iop_And8,
+ binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
+ mkU8(mask)));
+
+ tmp64 = newTemp(Ity_I64);
+ res64 = newTemp(Ity_I64);
+ rss64 = newTemp(Ity_I64);
+
+ if (sz == 2 || sz == 4) {
+
+ /* G is xtra; E is data */
+ /* what a freaking nightmare: */
+ if (sz == 4 && left_shift) {
+ assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
+ assign( res64,
+ binop(Iop_Shr64,
+ binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
+ mkU8(32)) );
+ assign( rss64,
+ binop(Iop_Shr64,
+ binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
+ mkU8(32)) );
+ }
+ else
+ if (sz == 4 && !left_shift) {
+ assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
+ assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
+ assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
+ }
+ else
+ if (sz == 2 && left_shift) {
+ assign( tmp64,
+ binop(Iop_32HLto64,
+ binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
+ binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
+ ));
+ /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
+ assign( res64,
+ binop(Iop_Shr64,
+ binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
+ mkU8(48)) );
+ /* subshift formed by shifting [esrc'0000'0000'0000] */
+ assign( rss64,
+ binop(Iop_Shr64,
+ binop(Iop_Shl64,
+ binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
+ mkU8(48)),
+ mkexpr(tmpSS)),
+ mkU8(48)) );
+ }
+ else
+ if (sz == 2 && !left_shift) {
+ assign( tmp64,
+ binop(Iop_32HLto64,
+ binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
+ binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
+ ));
+ /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
+ assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
+ /* subshift formed by shifting [0000'0000'0000'esrc] */
+ assign( rss64, binop(Iop_Shr64,
+ unop(Iop_16Uto64, mkexpr(esrc)),
+ mkexpr(tmpSS)) );
+ }
+
+ } else {
+
+ vassert(sz == 8);
+ if (left_shift) {
+ assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
+ assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
+ } else {
+ assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
+ assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
+ }
+
+ }
+
+ resTy = newTemp(ty);
+ rssTy = newTemp(ty);
+ assign( resTy, narrowTo(ty, mkexpr(res64)) );
+ assign( rssTy, narrowTo(ty, mkexpr(rss64)) );
+
+ /* Put result back and write the flags thunk. */
+ setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
+ resTy, rssTy, ty, tmpSH );
+
+ if (epartIsReg(modrm)) {
+ putIRegE(sz, pfx, modrm, mkexpr(resTy));
+ } else {
+ storeLE( mkexpr(addr), mkexpr(resTy) );
+ }
+
+ if (amt_is_literal) delta++;
+ return delta;
+}
+
+
+/* Handle BT/BTS/BTR/BTC Gv, Ev. Apparently b-size is not
+ required. */
+
+typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
+
+static HChar* nameBtOp ( BtOp op )
+{
+ switch (op) {
+ case BtOpNone: return "";
+ case BtOpSet: return "s";
+ case BtOpReset: return "r";
+ case BtOpComp: return "c";
+ default: vpanic("nameBtOp(amd64)");
+ }
+}
+
+
+static
+ULong dis_bt_G_E ( VexAbiInfo* vbi,
+ Prefix pfx, Int sz, Long delta, BtOp op )
+{
+ HChar dis_buf[50];
+ UChar modrm;
+ Int len;
+ IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
+ t_addr1, t_rsp, t_mask, t_new;
+
+ vassert(sz == 2 || sz == 4 || sz == 8);
+
+ t_fetched = t_bitno0 = t_bitno1 = t_bitno2
+ = t_addr0 = t_addr1 = t_rsp
+ = t_mask = t_new = IRTemp_INVALID;
+
+ t_fetched = newTemp(Ity_I8);
+ t_new = newTemp(Ity_I8);
+ t_bitno0 = newTemp(Ity_I64);
+ t_bitno1 = newTemp(Ity_I64);
+ t_bitno2 = newTemp(Ity_I8);
+ t_addr1 = newTemp(Ity_I64);
+ modrm = getUChar(delta);
+
+ assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );
+
+ if (epartIsReg(modrm)) {
+ delta++;
+ /* Get it onto the client's stack. Oh, this is a horrible
+ kludge. See https://bugs.kde.org/show_bug.cgi?id=245925.
+ Because of the ELF ABI stack redzone, there may be live data
+ up to 128 bytes below %RSP. So we can't just push it on the
+ stack, else we may wind up trashing live data, and causing
+ impossible-to-find simulation errors. (Yes, this did
+         happen.)  So we need to drop RSP by at least 128 before
+ pushing it. That unfortunately means hitting Memcheck's
+ fast-case painting code. Ideally we should drop more than
+ 128, to reduce the chances of breaking buggy programs that
+ have live data below -128(%RSP). Memcheck fast-cases moves
+ of 288 bytes due to the need to handle ppc64-linux quickly,
+ so let's use 288. Of course the real fix is to get rid of
+ this kludge entirely. */
+ t_rsp = newTemp(Ity_I64);
+ t_addr0 = newTemp(Ity_I64);
+
+ vassert(vbi->guest_stack_redzone_size == 128);
+ assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
+ putIReg64(R_RSP, mkexpr(t_rsp));
+
+ storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );
+
+ /* Make t_addr0 point at it. */
+ assign( t_addr0, mkexpr(t_rsp) );
+
+ /* Mask out upper bits of the shift amount, since we're doing a
+ reg. */
+ assign( t_bitno1, binop(Iop_And64,
+ mkexpr(t_bitno0),
+ mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );
+
+ } else {
+ t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ assign( t_bitno1, mkexpr(t_bitno0) );
+ }
+
+ /* At this point: t_addr0 is the address being operated on. If it
+ was a reg, we will have pushed it onto the client's stack.
+ t_bitno1 is the bit number, suitably masked in the case of a
+ reg. */
+
+ /* Now the main sequence. */
+ assign( t_addr1,
+ binop(Iop_Add64,
+ mkexpr(t_addr0),
+ binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );
+
+ /* t_addr1 now holds effective address */
+
+ assign( t_bitno2,
+ unop(Iop_64to8,
+ binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );
+
+ /* t_bitno2 contains offset of bit within byte */
+
+ if (op != BtOpNone) {
+ t_mask = newTemp(Ity_I8);
+ assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
+ }
+
+ /* t_mask is now a suitable byte mask */
+
+ assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
+
+ if (op != BtOpNone) {
+ switch (op) {
+ case BtOpSet:
+ assign( t_new,
+ binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
+ break;
+ case BtOpComp:
+ assign( t_new,
+ binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
+ break;
+ case BtOpReset:
+ assign( t_new,
+ binop(Iop_And8, mkexpr(t_fetched),
+ unop(Iop_Not8, mkexpr(t_mask))) );
+ break;
+ default:
+ vpanic("dis_bt_G_E(amd64)");
+ }
+ if ((pfx & PFX_LOCK) && !epartIsReg(modrm)) {
+ casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
+ mkexpr(t_new)/*new*/,
+ guest_RIP_curr_instr );
+ } else {
+ storeLE( mkexpr(t_addr1), mkexpr(t_new) );
+ }
+ }
+
+ /* Side effect done; now get selected bit into Carry flag */
+ /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop(Iop_And64,
+ binop(Iop_Shr64,
+ unop(Iop_8Uto64, mkexpr(t_fetched)),
+ mkexpr(t_bitno2)),
+ mkU64(1)))
+ );
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+
+ /* Move reg operand from stack back to reg */
+ if (epartIsReg(modrm)) {
+ /* t_rsp still points at it. */
+ /* only write the reg if actually modifying it; doing otherwise
+ zeroes the top half erroneously when doing btl due to
+ standard zero-extend rule */
+ if (op != BtOpNone)
+ putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
+ putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
+ }
+
+ DIP("bt%s%c %s, %s\n",
+ nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
+ ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );
+
+ return delta;
+}
+
+
+
+/* Handle BSF/BSR. Only v-size seems necessary. */
+static
+ULong dis_bs_E_G ( VexAbiInfo* vbi,
+ Prefix pfx, Int sz, Long delta, Bool fwds )
+{
+ Bool isReg;
+ UChar modrm;
+ HChar dis_buf[50];
+
+ IRType ty = szToITy(sz);
+ IRTemp src = newTemp(ty);
+ IRTemp dst = newTemp(ty);
+ IRTemp src64 = newTemp(Ity_I64);
+ IRTemp dst64 = newTemp(Ity_I64);
+ IRTemp src8 = newTemp(Ity_I8);
+
+ vassert(sz == 8 || sz == 4 || sz == 2);
+
+ modrm = getUChar(delta);
+ isReg = epartIsReg(modrm);
+ if (isReg) {
+ delta++;
+ assign( src, getIRegE(sz, pfx, modrm) );
+ } else {
+ Int len;
+ IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
+ delta += len;
+ assign( src, loadLE(ty, mkexpr(addr)) );
+ }
+
+ DIP("bs%c%c %s, %s\n",
+ fwds ? 'f' : 'r', nameISize(sz),
+ ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
+ nameIRegG(sz, pfx, modrm));
+
+ /* First, widen src to 64 bits if it is not already. */
+ assign( src64, widenUto64(mkexpr(src)) );
+
+ /* Generate an 8-bit expression which is zero iff the
+ original is zero, and nonzero otherwise */
+ assign( src8,
+ unop(Iop_1Uto8,
+ binop(Iop_CmpNE64,
+ mkexpr(src64), mkU64(0))) );
+
+ /* Flags: Z is 1 iff source value is zero. All others
+ are undefined -- we force them to zero. */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ IRExpr_Mux0X( mkexpr(src8),
+ /* src==0 */
+ mkU64(AMD64G_CC_MASK_Z),
+ /* src!=0 */
+ mkU64(0)
+ )
+ ));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+
+   /* Result: if the source value is zero, we can't use
+ Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
+ But anyway, amd64 semantics say the result is undefined in
+ such situations. Hence handle the zero case specially. */
+
+ /* Bleh. What we compute:
+
+ bsf64: if src == 0 then {dst is unchanged}
+ else Ctz64(src)
+
+ bsr64: if src == 0 then {dst is unchanged}
+ else 63 - Clz64(src)
+
+ bsf32: if src == 0 then {dst is unchanged}
+ else Ctz64(32Uto64(src))
+
+ bsr32: if src == 0 then {dst is unchanged}
+ else 63 - Clz64(32Uto64(src))
+
+ bsf16: if src == 0 then {dst is unchanged}
+ else Ctz64(32Uto64(16Uto32(src)))
+
+ bsr16: if src == 0 then {dst is unchanged}
+ else 63 - Clz64(32Uto64(16Uto32(src)))
+ */
+
+ /* The main computation, guarding against zero. */
+ assign( dst64,
+ IRExpr_Mux0X(
+ mkexpr(src8),
+ /* src == 0 -- leave dst unchanged */
+ widenUto64( getIRegG( sz, pfx, modrm ) ),
+ /* src != 0 */
+ fwds ? unop(Iop_Ctz64, mkexpr(src64))
+ : binop(Iop_Sub64,
+ mkU64(63),
+ unop(Iop_Clz64, mkexpr(src64)))
+ )
+ );
+
+ if (sz == 2)
+ assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
+ else
+ if (sz == 4)
+ assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
+ else
+ assign( dst, mkexpr(dst64) );
+
+ /* dump result back */
+ putIRegG( sz, pfx, modrm, mkexpr(dst) );
+
+ return delta;
+}
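+
+/* Scalar sketch of the 64-bit cases above (hypothetical helpers, not
+   used by the decoder; __builtin_ctzll/__builtin_clzll assume a
+   gcc-style compiler): */
+static ULong ref_bsf64 ( ULong src, ULong old_dst )
+{
+   if (src == 0) return old_dst;             /* dst unchanged */
+   return (ULong)__builtin_ctzll(src);       /* Ctz64(src) */
+}
+static ULong ref_bsr64 ( ULong src, ULong old_dst )
+{
+   if (src == 0) return old_dst;             /* dst unchanged */
+   return 63 - (ULong)__builtin_clzll(src);  /* 63 - Clz64(src) */
+}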
+
+
+/* swap rAX with the reg specified by reg and REX.B */
+static
+void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
+{
+ IRType ty = szToITy(sz);
+ IRTemp t1 = newTemp(ty);
+ IRTemp t2 = newTemp(ty);
+ vassert(sz == 4 || sz == 8);
+ vassert(regLo3 < 8);
+ if (sz == 8) {
+ assign( t1, getIReg64(R_RAX) );
+ assign( t2, getIRegRexB(8, pfx, regLo3) );
+ putIReg64( R_RAX, mkexpr(t2) );
+ putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
+ } else {
+ assign( t1, getIReg32(R_RAX) );
+ assign( t2, getIRegRexB(4, pfx, regLo3) );
+ putIReg32( R_RAX, mkexpr(t2) );
+ putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
+ }
+ DIP("xchg%c %s, %s\n",
+ nameISize(sz), nameIRegRAX(sz),
+ nameIRegRexB(sz,pfx, regLo3));
+}
+
+
+static
+void codegen_SAHF ( void )
+{
+ /* Set the flags to:
+      (amd64g_calculate_rflags_all() & AMD64G_CC_MASK_O)
+         -- retain the old O flag
+      | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
+                |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C))
+ */
+ ULong mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
+ |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
+ IRTemp oldflags = newTemp(Ity_I64);
+ assign( oldflags, mk_amd64g_calculate_rflags_all() );
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
+ binop(Iop_And64,
+ binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
+ mkU64(mask_SZACP))
+ )
+ ));
+}
+
+
+static
+void codegen_LAHF ( void )
+{
+ /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
+ IRExpr* rax_with_hole;
+ IRExpr* new_byte;
+ IRExpr* new_rax;
+ ULong mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
+ |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
+
+ IRTemp flags = newTemp(Ity_I64);
+ assign( flags, mk_amd64g_calculate_rflags_all() );
+
+ rax_with_hole
+ = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
+ new_byte
+ = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
+ mkU64(1<<1));
+ new_rax
+ = binop(Iop_Or64, rax_with_hole,
+ binop(Iop_Shl64, new_byte, mkU8(8)));
+ putIReg64(R_RAX, new_rax);
+}
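+
+/* For reference, the byte moved between %AH and the flags by
+   LAHF/SAHF is laid out as follows (bit 1 is architecturally always
+   1, bits 3 and 5 always 0):
+
+      bit:   7    6    5    4    3    2    1    0
+             SF   ZF   0    AF   0    PF   1    CF
+*/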
+
+
+static
+ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
+ VexAbiInfo* vbi,
+ Prefix pfx,
+ Int size,
+ Long delta0 )
+{
+ HChar dis_buf[50];
+ Int len;
+
+ IRType ty = szToITy(size);
+ IRTemp acc = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dest = newTemp(ty);
+ IRTemp dest2 = newTemp(ty);
+ IRTemp acc2 = newTemp(ty);
+ IRTemp cond8 = newTemp(Ity_I8);
+ IRTemp addr = IRTemp_INVALID;
+ UChar rm = getUChar(delta0);
+
+ /* There are 3 cases to consider:
+
+ reg-reg: ignore any lock prefix, generate sequence based
+ on Mux0X
+
+ reg-mem, not locked: ignore any lock prefix, generate sequence
+ based on Mux0X
+
+ reg-mem, locked: use IRCAS
+ */
+
+ if (epartIsReg(rm)) {
+ /* case 1 */
+ assign( dest, getIRegE(size, pfx, rm) );
+ delta0++;
+ assign( src, getIRegG(size, pfx, rm) );
+ assign( acc, getIRegRAX(size) );
+ setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
+ assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
+ assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
+ assign( acc2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
+ putIRegRAX(size, mkexpr(acc2));
+ putIRegE(size, pfx, rm, mkexpr(dest2));
+ DIP("cmpxchg%c %s,%s\n", nameISize(size),
+ nameIRegG(size,pfx,rm),
+ nameIRegE(size,pfx,rm) );
+ }
+ else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
+ /* case 2 */
+ addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
+ assign( dest, loadLE(ty, mkexpr(addr)) );
+ delta0 += len;
+ assign( src, getIRegG(size, pfx, rm) );
+ assign( acc, getIRegRAX(size) );
+ setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
+ assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
+ assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
+ assign( acc2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
+ putIRegRAX(size, mkexpr(acc2));
+ storeLE( mkexpr(addr), mkexpr(dest2) );
+ DIP("cmpxchg%c %s,%s\n", nameISize(size),
+ nameIRegG(size,pfx,rm), dis_buf);
+ }
+ else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
+ /* case 3 */
+ /* src is new value. acc is expected value. dest is old value.
+ Compute success from the output of the IRCAS, and steer the
+ new value for RAX accordingly: in case of success, RAX is
+ unchanged. */
+ addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
+ delta0 += len;
+ assign( src, getIRegG(size, pfx, rm) );
+ assign( acc, getIRegRAX(size) );
+ stmt( IRStmt_CAS(
+ mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
+ NULL, mkexpr(acc), NULL, mkexpr(src) )
+ ));
+ setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
+ assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
+ assign( acc2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
+ putIRegRAX(size, mkexpr(acc2));
+ DIP("cmpxchg%c %s,%s\n", nameISize(size),
+ nameIRegG(size,pfx,rm), dis_buf);
+ }
+ else vassert(0);
+
+ *ok = True;
+ return delta0;
+}
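+
+/* Scalar sketch of the cmpxchg semantics implemented above
+   (hypothetical helper, shown for the 64-bit case only): */
+static void ref_cmpxchg64 ( ULong* dest, ULong* rax, ULong src )
+{
+   if (*rax == *dest) {
+      *dest = src;     /* success: ZF=1, RAX unchanged */
+   } else {
+      *rax = *dest;    /* failure: ZF=0, old value lands in RAX */
+   }
+}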
+
+
+/* Handle conditional move instructions of the form
+ cmovcc E(reg-or-mem), G(reg)
+
+ E(src) is reg-or-mem
+ G(dst) is reg.
+
+ If E is reg, --> GET %E, tmps
+ GET %G, tmpd
+ CMOVcc tmps, tmpd
+ PUT tmpd, %G
+
+ If E is mem --> (getAddr E) -> tmpa
+ LD (tmpa), tmps
+ GET %G, tmpd
+ CMOVcc tmps, tmpd
+ PUT tmpd, %G
+*/
+static
+ULong dis_cmov_E_G ( VexAbiInfo* vbi,
+ Prefix pfx,
+ Int sz,
+ AMD64Condcode cond,
+ Long delta0 )
+{
+ UChar rm = getUChar(delta0);
+ HChar dis_buf[50];
+ Int len;
+
+ IRType ty = szToITy(sz);
+ IRTemp tmps = newTemp(ty);
+ IRTemp tmpd = newTemp(ty);
+
+ if (epartIsReg(rm)) {
+ assign( tmps, getIRegE(sz, pfx, rm) );
+ assign( tmpd, getIRegG(sz, pfx, rm) );
+
+ putIRegG( sz, pfx, rm,
+ IRExpr_Mux0X( unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(cond)),
+ mkexpr(tmpd),
+ mkexpr(tmps) )
+ );
+ DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
+ nameIRegE(sz,pfx,rm),
+ nameIRegG(sz,pfx,rm));
+ return 1+delta0;
+ }
+
+ /* E refers to memory */
+ {
+ IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
+ assign( tmps, loadLE(ty, mkexpr(addr)) );
+ assign( tmpd, getIRegG(sz, pfx, rm) );
+
+ putIRegG( sz, pfx, rm,
+ IRExpr_Mux0X( unop(Iop_1Uto8,
+ mk_amd64g_calculate_condition(cond)),
+ mkexpr(tmpd),
+ mkexpr(tmps) )
+ );
+
+ DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
+ dis_buf,
+ nameIRegG(sz,pfx,rm));
+ return len+delta0;
+ }
+}
+
+
+static
+ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
+ VexAbiInfo* vbi,
+ Prefix pfx, Int sz, Long delta0 )
+{
+ Int len;
+ UChar rm = getUChar(delta0);
+ HChar dis_buf[50];
+
+ IRType ty = szToITy(sz);
+ IRTemp tmpd = newTemp(ty);
+ IRTemp tmpt0 = newTemp(ty);
+ IRTemp tmpt1 = newTemp(ty);
+
+ /* There are 3 cases to consider:
+
+ reg-reg: ignore any lock prefix,
+ generate 'naive' (non-atomic) sequence
+
+ reg-mem, not locked: ignore any lock prefix, generate 'naive'
+ (non-atomic) sequence
+
+ reg-mem, locked: use IRCAS
+ */
+
+ if (epartIsReg(rm)) {
+ /* case 1 */
+ assign( tmpd, getIRegE(sz, pfx, rm) );
+ assign( tmpt0, getIRegG(sz, pfx, rm) );
+ assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(tmpd), mkexpr(tmpt0)) );
+ setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
+ putIRegG(sz, pfx, rm, mkexpr(tmpd));
+ putIRegE(sz, pfx, rm, mkexpr(tmpt1));
+ DIP("xadd%c %s, %s\n",
+ nameISize(sz), nameIRegG(sz,pfx,rm),
+ nameIRegE(sz,pfx,rm));
+ *decode_ok = True;
+ return 1+delta0;
+ }
+ else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
+ /* case 2 */
+ IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
+ assign( tmpd, loadLE(ty, mkexpr(addr)) );
+ assign( tmpt0, getIRegG(sz, pfx, rm) );
+ assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(tmpd), mkexpr(tmpt0)) );
+ setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
+ storeLE( mkexpr(addr), mkexpr(tmpt1) );
+ putIRegG(sz, pfx, rm, mkexpr(tmpd));
+ DIP("xadd%c %s, %s\n",
+ nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
+ *decode_ok = True;
+ return len+delta0;
+ }
+ else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
+ /* case 3 */
+ IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
+ assign( tmpd, loadLE(ty, mkexpr(addr)) );
+ assign( tmpt0, getIRegG(sz, pfx, rm) );
+ assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(tmpd), mkexpr(tmpt0)) );
+ casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
+ mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
+ setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
+ putIRegG(sz, pfx, rm, mkexpr(tmpd));
+ DIP("xadd%c %s, %s\n",
+ nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
+ *decode_ok = True;
+ return len+delta0;
+ }
+ /*UNREACHED*/
+ vassert(0);
+}
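+
+/* Scalar sketch of XADD's effect (hypothetical helper; the locked
+   variant above additionally makes the read-modify-write atomic via
+   casLE): */
+static void ref_xadd64 ( ULong* e, ULong* g )
+{
+   ULong sum = *e + *g;   /* flags are set from this addition */
+   *g = *e;               /* old destination value goes to G */
+   *e = sum;              /* destination receives the sum */
+}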
+
+//.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
+//..
+//.. static
+//.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
+//.. {
+//.. Int len;
+//.. IRTemp addr;
+//.. UChar rm = getUChar(delta0);
+//.. HChar dis_buf[50];
+//..
+//.. if (epartIsReg(rm)) {
+//.. putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
+//.. DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
+//.. return 1+delta0;
+//.. } else {
+//.. addr = disAMode ( &len, sorb, delta0, dis_buf );
+//.. putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
+//.. DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
+//.. return len+delta0;
+//.. }
+//.. }
+//..
+//.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem). If
+//.. dst is ireg and sz==4, zero out top half of it. */
+//..
+//.. static
+//.. UInt dis_mov_Sw_Ew ( UChar sorb,
+//.. Int sz,
+//.. UInt delta0 )
+//.. {
+//.. Int len;
+//.. IRTemp addr;
+//.. UChar rm = getUChar(delta0);
+//.. HChar dis_buf[50];
+//..
+//.. vassert(sz == 2 || sz == 4);
+//..
+//.. if (epartIsReg(rm)) {
+//.. if (sz == 4)
+//.. putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
+//.. else
+//.. putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
+//..
+//.. DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
+//.. return 1+delta0;
+//.. } else {
+//.. addr = disAMode ( &len, sorb, delta0, dis_buf );
+//.. storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
+//.. DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
+//.. return len+delta0;
+//.. }
+//.. }
+//..
+//..
+//.. static
+//.. void dis_push_segreg ( UInt sreg, Int sz )
+//.. {
+//.. IRTemp t1 = newTemp(Ity_I16);
+//.. IRTemp ta = newTemp(Ity_I32);
+//.. vassert(sz == 2 || sz == 4);
+//..
+//.. assign( t1, getSReg(sreg) );
+//.. assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
+//.. putIReg(4, R_ESP, mkexpr(ta));
+//.. storeLE( mkexpr(ta), mkexpr(t1) );
+//..
+//.. DIP("pushw %s\n", nameSReg(sreg));
+//.. }
+//..
+//.. static
+//.. void dis_pop_segreg ( UInt sreg, Int sz )
+//.. {
+//.. IRTemp t1 = newTemp(Ity_I16);
+//.. IRTemp ta = newTemp(Ity_I32);
+//.. vassert(sz == 2 || sz == 4);
+//..
+//.. assign( ta, getIReg(4, R_ESP) );
+//.. assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
+//..
+//.. putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
+//.. putSReg( sreg, mkexpr(t1) );
+//.. DIP("pop %s\n", nameSReg(sreg));
+//.. }
+
+static
+void dis_ret ( VexAbiInfo* vbi, ULong d64 )
+{
+ IRTemp t1 = newTemp(Ity_I64);
+ IRTemp t2 = newTemp(Ity_I64);
+ IRTemp t3 = newTemp(Ity_I64);
+ assign(t1, getIReg64(R_RSP));
+ assign(t2, loadLE(Ity_I64,mkexpr(t1)));
+ assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
+ putIReg64(R_RSP, mkexpr(t3));
+ make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
+ jmp_treg(Ijk_Ret,t2);
+}
+
+
+/*------------------------------------------------------------*/
+/*--- SSE/SSE2/SSE3 helpers ---*/
+/*------------------------------------------------------------*/
+
+/* Worker function; do not call directly.
+ Handles full width G = G `op` E and G = (not G) `op` E.
+*/
+
+static ULong dis_SSE_E_to_G_all_wrk (
+ VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op,
+ Bool invertG
+ )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getUChar(delta);
+ IRExpr* gpart
+ = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
+ : getXMMReg(gregOfRexRM(pfx,rm));
+ if (epartIsReg(rm)) {
+ putXMMReg( gregOfRexRM(pfx,rm),
+ binop(op, gpart,
+ getXMMReg(eregOfRexRM(pfx,rm))) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ putXMMReg( gregOfRexRM(pfx,rm),
+ binop(op, gpart,
+ loadLE(Ity_V128, mkexpr(addr))) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* All lanes SSE binary operation, G = G `op` E. */
+
+static
+ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op )
+{
+ return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
+}
+
+/* All lanes SSE binary operation, G = (not G) `op` E. */
+
+static
+ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op )
+{
+ return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
+}
+
+
+/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
+
+static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getUChar(delta);
+ IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
+ if (epartIsReg(rm)) {
+ putXMMReg( gregOfRexRM(pfx,rm),
+ binop(op, gpart,
+ getXMMReg(eregOfRexRM(pfx,rm))) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+1;
+ } else {
+ /* We can only do a 32-bit memory read, so the upper 3/4 of the
+         E operand needs to be filled with zeroes. */
+ IRTemp epart = newTemp(Ity_V128);
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( epart, unop( Iop_32UtoV128,
+ loadLE(Ity_I32, mkexpr(addr))) );
+ putXMMReg( gregOfRexRM(pfx,rm),
+ binop(op, gpart, mkexpr(epart)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
+
+static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getUChar(delta);
+ IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
+ if (epartIsReg(rm)) {
+ putXMMReg( gregOfRexRM(pfx,rm),
+ binop(op, gpart,
+ getXMMReg(eregOfRexRM(pfx,rm))) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+1;
+ } else {
+ /* We can only do a 64-bit memory read, so the upper half of the
+         E operand needs to be filled with zeroes. */
+ IRTemp epart = newTemp(Ity_V128);
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( epart, unop( Iop_64UtoV128,
+ loadLE(Ity_I64, mkexpr(addr))) );
+ putXMMReg( gregOfRexRM(pfx,rm),
+ binop(op, gpart, mkexpr(epart)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* All lanes unary SSE operation, G = op(E). */
+
+static ULong dis_SSE_E_to_G_unary_all (
+ VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op
+ )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getUChar(delta);
+ if (epartIsReg(rm)) {
+ putXMMReg( gregOfRexRM(pfx,rm),
+ unop(op, getXMMReg(eregOfRexRM(pfx,rm))) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ putXMMReg( gregOfRexRM(pfx,rm),
+ unop(op, loadLE(Ity_V128, mkexpr(addr))) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
+
+static ULong dis_SSE_E_to_G_unary_lo32 (
+ VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op
+ )
+{
+ /* First we need to get the old G value and patch the low 32 bits
+ of the E operand into it. Then apply op and write back to G. */
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getUChar(delta);
+ IRTemp oldG0 = newTemp(Ity_V128);
+ IRTemp oldG1 = newTemp(Ity_V128);
+
+ assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
+
+ if (epartIsReg(rm)) {
+ assign( oldG1,
+ binop( Iop_SetV128lo32,
+ mkexpr(oldG0),
+ getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
+ putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( oldG1,
+ binop( Iop_SetV128lo32,
+ mkexpr(oldG0),
+ loadLE(Ity_I32, mkexpr(addr)) ));
+ putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
+
+static ULong dis_SSE_E_to_G_unary_lo64 (
+ VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op
+ )
+{
+ /* First we need to get the old G value and patch the low 64 bits
+ of the E operand into it. Then apply op and write back to G. */
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getUChar(delta);
+ IRTemp oldG0 = newTemp(Ity_V128);
+ IRTemp oldG1 = newTemp(Ity_V128);
+
+ assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
+
+ if (epartIsReg(rm)) {
+ assign( oldG1,
+ binop( Iop_SetV128lo64,
+ mkexpr(oldG0),
+ getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
+ putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( oldG1,
+ binop( Iop_SetV128lo64,
+ mkexpr(oldG0),
+ loadLE(Ity_I64, mkexpr(addr)) ));
+ putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* SSE integer binary operation:
+ G = G `op` E (eLeft == False)
+ G = E `op` G (eLeft == True)
+*/
+static ULong dis_SSEint_E_to_G(
+ VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op,
+ Bool eLeft
+ )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getUChar(delta);
+ IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
+ IRExpr* epart = NULL;
+ if (epartIsReg(rm)) {
+ epart = getXMMReg(eregOfRexRM(pfx,rm));
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ delta += 1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ epart = loadLE(Ity_V128, mkexpr(addr));
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ delta += alen;
+ }
+ putXMMReg( gregOfRexRM(pfx,rm),
+ eLeft ? binop(op, epart, gpart)
+ : binop(op, gpart, epart) );
+ return delta;
+}
+
+
+/* Helper for doing SSE FP comparisons. */
+
+static void findSSECmpOp ( Bool* needNot, IROp* op,
+ Int imm8, Bool all_lanes, Int sz )
+{
+ imm8 &= 7;
+ *needNot = False;
+ *op = Iop_INVALID;
+ if (imm8 >= 4) {
+ *needNot = True;
+ imm8 -= 4;
+ }
+
+ if (sz == 4 && all_lanes) {
+ switch (imm8) {
+ case 0: *op = Iop_CmpEQ32Fx4; return;
+ case 1: *op = Iop_CmpLT32Fx4; return;
+ case 2: *op = Iop_CmpLE32Fx4; return;
+ case 3: *op = Iop_CmpUN32Fx4; return;
+ default: break;
+ }
+ }
+ if (sz == 4 && !all_lanes) {
+ switch (imm8) {
+ case 0: *op = Iop_CmpEQ32F0x4; return;
+ case 1: *op = Iop_CmpLT32F0x4; return;
+ case 2: *op = Iop_CmpLE32F0x4; return;
+ case 3: *op = Iop_CmpUN32F0x4; return;
+ default: break;
+ }
+ }
+ if (sz == 8 && all_lanes) {
+ switch (imm8) {
+ case 0: *op = Iop_CmpEQ64Fx2; return;
+ case 1: *op = Iop_CmpLT64Fx2; return;
+ case 2: *op = Iop_CmpLE64Fx2; return;
+ case 3: *op = Iop_CmpUN64Fx2; return;
+ default: break;
+ }
+ }
+ if (sz == 8 && !all_lanes) {
+ switch (imm8) {
+ case 0: *op = Iop_CmpEQ64F0x2; return;
+ case 1: *op = Iop_CmpLT64F0x2; return;
+ case 2: *op = Iop_CmpLE64F0x2; return;
+ case 3: *op = Iop_CmpUN64F0x2; return;
+ default: break;
+ }
+ }
+ vpanic("findSSECmpOp(amd64,guest)");
+}
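+
+/* Example (illustrative): imm8 == 5 encodes a not-less-than (NLT)
+   comparison.  Since 5 >= 4, needNot is set and imm8 becomes 1,
+   selecting the LT comparison; the caller then negates the
+   lane-wise LT result to obtain NLT. */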
+
+/* Handles SSE 32F/64F comparisons. */
+
+static ULong dis_SSEcmp_E_to_G ( VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, Bool all_lanes, Int sz )
+{
+ HChar dis_buf[50];
+ Int alen, imm8;
+ IRTemp addr;
+ Bool needNot = False;
+ IROp op = Iop_INVALID;
+ IRTemp plain = newTemp(Ity_V128);
+ UChar rm = getUChar(delta);
+ UShort mask = 0;
+ vassert(sz == 4 || sz == 8);
+ if (epartIsReg(rm)) {
+ imm8 = getUChar(delta+1);
+ findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
+ assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
+ getXMMReg(eregOfRexRM(pfx,rm))) );
+ delta += 2;
+ DIP("%s $%d,%s,%s\n", opname,
+ (Int)imm8,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
+ imm8 = getUChar(delta+alen);
+ findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
+ assign( plain,
+ binop(
+ op,
+ getXMMReg(gregOfRexRM(pfx,rm)),
+ all_lanes ? loadLE(Ity_V128, mkexpr(addr))
+ : sz == 8 ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
+ : /*sz==4*/ unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
+ )
+ );
+ delta += alen+1;
+ DIP("%s $%d,%s,%s\n", opname,
+ (Int)imm8,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ }
+
+ if (needNot && all_lanes) {
+ putXMMReg( gregOfRexRM(pfx,rm),
+ unop(Iop_NotV128, mkexpr(plain)) );
+ }
+ else
+ if (needNot && !all_lanes) {
+ mask = toUShort(sz==4 ? 0x000F : 0x00FF);
+ putXMMReg( gregOfRexRM(pfx,rm),
+ binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
+ }
+ else {
+ putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
+ }
+
+ return delta;
+}
+
+
+/* Vector by scalar shift of G by the amount specified at the bottom
+ of E. */
+
+static ULong dis_SSE_shiftG_byE ( VexAbiInfo* vbi,
+ Prefix pfx, Long delta,
+ HChar* opname, IROp op )
+{
+ HChar dis_buf[50];
+ Int alen, size;
+ IRTemp addr;
+ Bool shl, shr, sar;
+ UChar rm = getUChar(delta);
+ IRTemp g0 = newTemp(Ity_V128);
+ IRTemp g1 = newTemp(Ity_V128);
+ IRTemp amt = newTemp(Ity_I32);
+ IRTemp amt8 = newTemp(Ity_I8);
+ if (epartIsReg(rm)) {
+ assign( amt, getXMMRegLane32(eregOfRexRM(pfx,rm), 0) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRexRM(pfx,rm)),
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ delta++;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,rm)) );
+ delta += alen;
+ }
+ assign( g0, getXMMReg(gregOfRexRM(pfx,rm)) );
+ assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
+
+ shl = shr = sar = False;
+ size = 0;
+ switch (op) {
+      case Iop_ShlN16x8: shl = True; size = 16; break;
+ case Iop_ShlN32x4: shl = True; size = 32; break;
+ case Iop_ShlN64x2: shl = True; size = 64; break;
+ case Iop_SarN16x8: sar = True; size = 16; break;
+ case Iop_SarN32x4: sar = True; size = 32; break;
+ case Iop_ShrN16x8: shr = True; size = 16; break;
+ case Iop_ShrN32x4: shr = True; size = 32; break;
+ case Iop_ShrN64x2: shr = True; size = 64; break;
+ default: vassert(0);
+ }
+
+ if (shl || shr) {
+ assign(
+ g1,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))),
+ mkV128(0x0000),
+ binop(op, mkexpr(g0), mkexpr(amt8))
+ )
+ );
+ } else
+ if (sar) {
+ assign(
+ g1,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))),
+ binop(op, mkexpr(g0), mkU8(size-1)),
+ binop(op, mkexpr(g0), mkexpr(amt8))
+ )
+ );
+ } else {
+ vassert(0);
+ }
+
+ putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
+ return delta;
+}
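+
+/* Per-lane model of the out-of-range handling above (hypothetical
+   helpers for the 16-bit lane case; assumes >> on a negative signed
+   value is an arithmetic shift, as with gcc): a count >= the lane
+   size zeroes logical shifts, while arithmetic right shifts behave
+   like a shift by size-1. */
+static UShort ref_psrlw_lane ( UShort x, ULong amt )
+{
+   return amt >= 16 ? 0 : (UShort)(x >> amt);
+}
+static Short ref_psraw_lane ( Short x, ULong amt )
+{
+   return (Short)(x >> (amt >= 16 ? 15 : amt));
+}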
+
+
+/* Vector by scalar shift of E by an immediate byte. */
+
+static
+ULong dis_SSE_shiftE_imm ( Prefix pfx,
+ Long delta, HChar* opname, IROp op )
+{
+ Bool shl, shr, sar;
+ UChar rm = getUChar(delta);
+ IRTemp e0 = newTemp(Ity_V128);
+ IRTemp e1 = newTemp(Ity_V128);
+ UChar amt, size;
+ vassert(epartIsReg(rm));
+ vassert(gregLO3ofRM(rm) == 2
+ || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
+ amt = getUChar(delta+1);
+ delta += 2;
+ DIP("%s $%d,%s\n", opname,
+ (Int)amt,
+ nameXMMReg(eregOfRexRM(pfx,rm)) );
+ assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
+
+ shl = shr = sar = False;
+ size = 0;
+ switch (op) {
+ case Iop_ShlN16x8: shl = True; size = 16; break;
+ case Iop_ShlN32x4: shl = True; size = 32; break;
+ case Iop_ShlN64x2: shl = True; size = 64; break;
+ case Iop_SarN16x8: sar = True; size = 16; break;
+ case Iop_SarN32x4: sar = True; size = 32; break;
+ case Iop_ShrN16x8: shr = True; size = 16; break;
+ case Iop_ShrN32x4: shr = True; size = 32; break;
+ case Iop_ShrN64x2: shr = True; size = 64; break;
+ default: vassert(0);
+ }
+
+ if (shl || shr) {
+ assign( e1, amt >= size
+ ? mkV128(0x0000)
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else
+ if (sar) {
+ assign( e1, amt >= size
+ ? binop(op, mkexpr(e0), mkU8(size-1))
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else {
+ vassert(0);
+ }
+
+ putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
+ return delta;
+}
+
+
+/* Get the current SSE rounding mode. */
+
+static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
+{
+ return
+ unop( Iop_64to32,
+ binop( Iop_And64,
+ IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
+ mkU64(3) ));
+}
+
+static void put_sse_roundingmode ( IRExpr* sseround )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
+ stmt( IRStmt_Put( OFFB_SSEROUND,
+ unop(Iop_32Uto64,sseround) ) );
+}
+
+/* Break a 128-bit value up into four 32-bit ints. */
+
+static void breakup128to32s ( IRTemp t128,
+ /*OUTs*/
+ IRTemp* t3, IRTemp* t2,
+ IRTemp* t1, IRTemp* t0 )
+{
+ IRTemp hi64 = newTemp(Ity_I64);
+ IRTemp lo64 = newTemp(Ity_I64);
+ assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
+ assign( lo64, unop(Iop_V128to64, mkexpr(t128)) );
+
+ vassert(t0 && *t0 == IRTemp_INVALID);
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+
+ *t0 = newTemp(Ity_I32);
+ *t1 = newTemp(Ity_I32);
+ *t2 = newTemp(Ity_I32);
+ *t3 = newTemp(Ity_I32);
+ assign( *t0, unop(Iop_64to32, mkexpr(lo64)) );
+ assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
+ assign( *t2, unop(Iop_64to32, mkexpr(hi64)) );
+ assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
+}
+
+/* Construct a 128-bit value from four 32-bit ints. */
+
+static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
+ IRTemp t1, IRTemp t0 )
+{
+ return
+ binop( Iop_64HLtoV128,
+ binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
+ binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
+ );
+}
+
+/* Break a 64-bit value up into four 16-bit ints. */
+
+static void breakup64to16s ( IRTemp t64,
+ /*OUTs*/
+ IRTemp* t3, IRTemp* t2,
+ IRTemp* t1, IRTemp* t0 )
+{
+ IRTemp hi32 = newTemp(Ity_I32);
+ IRTemp lo32 = newTemp(Ity_I32);
+ assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
+ assign( lo32, unop(Iop_64to32, mkexpr(t64)) );
+
+ vassert(t0 && *t0 == IRTemp_INVALID);
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+
+ *t0 = newTemp(Ity_I16);
+ *t1 = newTemp(Ity_I16);
+ *t2 = newTemp(Ity_I16);
+ *t3 = newTemp(Ity_I16);
+ assign( *t0, unop(Iop_32to16, mkexpr(lo32)) );
+ assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
+ assign( *t2, unop(Iop_32to16, mkexpr(hi32)) );
+ assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
+}
+
+/* Construct a 64-bit value from four 16-bit ints. */
+
+static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
+ IRTemp t1, IRTemp t0 )
+{
+ return
+ binop( Iop_32HLto64,
+ binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
+ binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
+ );
+}
+
+
+/* Helper for the SSSE3 (not SSE3) PMULHRSW insns. Given two 64-bit
+ values (aa,bb), computes, for each of the 4 16-bit lanes:
+
+ (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
+*/
+static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
+{
+ IRTemp aa = newTemp(Ity_I64);
+ IRTemp bb = newTemp(Ity_I64);
+ IRTemp aahi32s = newTemp(Ity_I64);
+ IRTemp aalo32s = newTemp(Ity_I64);
+ IRTemp bbhi32s = newTemp(Ity_I64);
+ IRTemp bblo32s = newTemp(Ity_I64);
+ IRTemp rHi = newTemp(Ity_I64);
+ IRTemp rLo = newTemp(Ity_I64);
+ IRTemp one32x2 = newTemp(Ity_I64);
+ assign(aa, aax);
+ assign(bb, bbx);
+ assign( aahi32s,
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
+ mkU8(16) ));
+ assign( aalo32s,
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
+ mkU8(16) ));
+ assign( bbhi32s,
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
+ mkU8(16) ));
+ assign( bblo32s,
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
+ mkU8(16) ));
+ assign(one32x2, mkU64( (1ULL << 32) + 1 ));
+ assign(
+ rHi,
+ binop(
+ Iop_ShrN32x2,
+ binop(
+ Iop_Add32x2,
+ binop(
+ Iop_ShrN32x2,
+ binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
+ mkU8(14)
+ ),
+ mkexpr(one32x2)
+ ),
+ mkU8(1)
+ )
+ );
+ assign(
+ rLo,
+ binop(
+ Iop_ShrN32x2,
+ binop(
+ Iop_Add32x2,
+ binop(
+ Iop_ShrN32x2,
+ binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
+ mkU8(14)
+ ),
+ mkexpr(one32x2)
+ ),
+ mkU8(1)
+ )
+ );
+ return
+ binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
+}
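+
+/* Scalar reference for one 16-bit lane of PMULHRSW (hypothetical
+   helper; assumes arithmetic >> on signed values, as with gcc).
+   The only overflow case, 0x8000 * 0x8000, yields 0x8000, matching
+   the hardware: */
+static Short ref_pmulhrsw_lane ( Short a, Short b )
+{
+   Int t = (((Int)a * (Int)b) >> 14) + 1;
+   return (Short)(t >> 1);
+}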
+
+/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns. Given two 64-bit
+ values (aa,bb), computes, for each lane:
+
+ if aa_lane < 0 then - bb_lane
+ else if aa_lane > 0 then bb_lane
+ else 0
+*/
+static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
+{
+ IRTemp aa = newTemp(Ity_I64);
+ IRTemp bb = newTemp(Ity_I64);
+ IRTemp zero = newTemp(Ity_I64);
+ IRTemp bbNeg = newTemp(Ity_I64);
+ IRTemp negMask = newTemp(Ity_I64);
+ IRTemp posMask = newTemp(Ity_I64);
+ IROp opSub = Iop_INVALID;
+ IROp opCmpGTS = Iop_INVALID;
+
+ switch (laneszB) {
+ case 1: opSub = Iop_Sub8x8; opCmpGTS = Iop_CmpGT8Sx8; break;
+ case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
+ case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
+ default: vassert(0);
+ }
+
+ assign( aa, aax );
+ assign( bb, bbx );
+ assign( zero, mkU64(0) );
+ assign( bbNeg, binop(opSub, mkexpr(zero), mkexpr(bb)) );
+ assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
+ assign( posMask, binop(opCmpGTS, mkexpr(aa), mkexpr(zero)) );
+
+ return
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(bb), mkexpr(posMask)),
+ binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
+
+}
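+
+/* Scalar sketch for one 16-bit lane of PSIGNW (hypothetical helper;
+   the negation wraps on two's-complement narrowing, so b == -32768
+   stays -32768, as on hardware): */
+static Short ref_psignw_lane ( Short a, Short b )
+{
+   if (a < 0)  return (Short)(-(Int)b);
+   if (a > 0)  return b;
+   return 0;
+}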
+
+/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns. Given a 64-bit
+ value aa, computes, for each lane
+
+ if aa < 0 then -aa else aa
+
+ Note that the result is interpreted as unsigned, so that the
+ absolute value of the most negative signed input can be
+ represented.
+*/
+static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
+{
+ IRTemp aa = newTemp(Ity_I64);
+ IRTemp zero = newTemp(Ity_I64);
+ IRTemp aaNeg = newTemp(Ity_I64);
+ IRTemp negMask = newTemp(Ity_I64);
+ IRTemp posMask = newTemp(Ity_I64);
+ IROp opSub = Iop_INVALID;
+ IROp opSarN = Iop_INVALID;
+
+ switch (laneszB) {
+ case 1: opSub = Iop_Sub8x8; opSarN = Iop_SarN8x8; break;
+ case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
+ case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
+ default: vassert(0);
+ }
+
+ assign( aa, aax );
+ assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
+ assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
+ assign( zero, mkU64(0) );
+ assign( aaNeg, binop(opSub, mkexpr(zero), mkexpr(aa)) );
+ return
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(aa), mkexpr(posMask)),
+ binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
+}
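+
+/* Scalar sketch for one 16-bit lane of PABSW (hypothetical helper);
+   returning an unsigned type lets abs(-32768) == 0x8000 be
+   represented, per the note above: */
+static UShort ref_pabsw_lane ( Short a )
+{
+   return a < 0 ? (UShort)(-(Int)a) : (UShort)a;
+}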
+
+static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
+ IRTemp lo64, Long byteShift )
+{
+ vassert(byteShift >= 1 && byteShift <= 7);
+ return
+ binop(Iop_Or64,
+ binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
+ binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
+ );
+}
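+
+/* Worked example (illustrative): with hi64 = 0x1122334455667788,
+   lo64 = 0x99AABBCCDDEEFF00 and byteShift = 3, the result is
+   (hi64 << 40) | (lo64 >> 24) = 0x66778899AABBCCDD, i.e. the
+   16-byte concatenation hi64:lo64 shifted right by 3 bytes. */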
+
+/* Generate a SIGSEGV followed by a restart of the current instruction
+ if effective_addr is not 16-aligned. This is required behaviour
+ for some SSE3 instructions and all 128-bit SSSE3 instructions.
+ This assumes that guest_RIP_curr_instr is set correctly! */
+static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
+{
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE64,
+ binop(Iop_And64,mkexpr(effective_addr),mkU64(0xF)),
+ mkU64(0)),
+ Ijk_SigSEGV,
+ IRConst_U64(guest_RIP_curr_instr)
+ )
+ );
+}
+
+
+/* Helper for deciding whether a given insn (starting at the opcode
+ byte) may validly be used with a LOCK prefix. The following insns
+ may be used with LOCK when their destination operand is in memory.
+ AFAICS this is exactly the same for both 32-bit and 64-bit mode.
+
+ ADD 80 /0, 81 /0, 82 /0, 83 /0, 00, 01
+ OR 80 /1, 81 /1, 82 /x, 83 /1, 08, 09
+ ADC 80 /2, 81 /2, 82 /2, 83 /2, 10, 11
+   SBB    80 /3, 81 /3, 82 /x, 83 /3, 18, 19
+ AND 80 /4, 81 /4, 82 /x, 83 /4, 20, 21
+ SUB 80 /5, 81 /5, 82 /x, 83 /5, 28, 29
+ XOR 80 /6, 81 /6, 82 /x, 83 /6, 30, 31
+
+ DEC FE /1, FF /1
+ INC FE /0, FF /0
+
+ NEG F6 /3, F7 /3
+ NOT F6 /2, F7 /2
+
+ XCHG 86, 87
+
+ BTC 0F BB, 0F BA /7
+ BTR 0F B3, 0F BA /6
+ BTS 0F AB, 0F BA /5
+
+ CMPXCHG 0F B0, 0F B1
+ CMPXCHG8B 0F C7 /1
+
+ XADD 0F C0, 0F C1
+
+ ------------------------------
+
+ 80 /0 = addb $imm8, rm8
+ 81 /0 = addl $imm32, rm32 and addw $imm16, rm16
+ 82 /0 = addb $imm8, rm8
+ 83 /0 = addl $simm8, rm32 and addw $simm8, rm16
+
+ 00 = addb r8, rm8
+ 01 = addl r32, rm32 and addw r16, rm16
+
+ Same for ADD OR ADC SBB AND SUB XOR
+
+ FE /1 = dec rm8
+ FF /1 = dec rm32 and dec rm16
+
+ FE /0 = inc rm8
+ FF /0 = inc rm32 and inc rm16
+
+ F6 /3 = neg rm8
+ F7 /3 = neg rm32 and neg rm16
+
+ F6 /2 = not rm8
+ F7 /2 = not rm32 and not rm16
+
+ 0F BB = btcw r16, rm16 and btcl r32, rm32
+   0F BA /7 = btcw $imm8, rm16 and btcl $imm8, rm32
+
+ Same for BTS, BTR
+*/
+static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
+{
+ switch (opc[0]) {
+ case 0x00: case 0x01: case 0x08: case 0x09:
+ case 0x10: case 0x11: case 0x18: case 0x19:
+ case 0x20: case 0x21: case 0x28: case 0x29:
+ case 0x30: case 0x31:
+ if (!epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0x80: case 0x81: case 0x82: case 0x83:
+ if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
+ && !epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0xFE: case 0xFF:
+ if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
+ && !epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0xF6: case 0xF7:
+ if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
+ && !epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0x86: case 0x87:
+ if (!epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0x0F: {
+ switch (opc[1]) {
+ case 0xBB: case 0xB3: case 0xAB:
+ if (!epartIsReg(opc[2]))
+ return True;
+ break;
+ case 0xBA:
+ if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
+ && !epartIsReg(opc[2]))
+ return True;
+ break;
+ case 0xB0: case 0xB1:
+ if (!epartIsReg(opc[2]))
+ return True;
+ break;
+ case 0xC7:
+ if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
+ return True;
+ break;
+ case 0xC0: case 0xC1:
+ if (!epartIsReg(opc[2]))
+ return True;
+ break;
+ default:
+ break;
+ } /* switch (opc[1]) */
+ break;
+ }
+
+ default:
+ break;
+ } /* switch (opc[0]) */
+
+ return False;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Disassemble a single instruction ---*/
+/*------------------------------------------------------------*/
+
+/* Disassemble a single instruction into IR. The instruction is
+ located in host memory at &guest_code[delta]. */
+
+static
+DisResult disInstr_AMD64_WRK (
+ /*OUT*/Bool* expect_CAS,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ Long delta64,
+ VexArchInfo* archinfo,
+ VexAbiInfo* vbi
+ )
+{
+ IRType ty;
+ IRTemp addr, t0, t1, t2, t3, t4, t5, t6;
+ Int alen;
+ UChar opc, modrm, abyte, pre;
+ Long d64;
+ HChar dis_buf[50];
+ Int am_sz, d_sz, n, n_prefixes;
+ DisResult dres;
+ UChar* insn; /* used in SSE decoders */
+
+ /* The running delta */
+ Long delta = delta64;
+
+   /* Holds rip at the start of the insn, so that we can print
+ consistent error messages for unimplemented insns. */
+ Long delta_start = delta;
+
+ /* sz denotes the nominal data-op size of the insn; we change it to
+ 2 if an 0x66 prefix is seen and 8 if REX.W is 1. In case of
+ conflict REX.W takes precedence. */
+ Int sz = 4;
+
+ /* pfx holds the summary of prefixes. */
+ Prefix pfx = PFX_EMPTY;
+
+ /* Set result defaults. */
+ dres.whatNext = Dis_Continue;
+ dres.len = 0;
+ dres.continueAt = 0;
+
+ *expect_CAS = False;
+
+ vassert(guest_RIP_next_assumed == 0);
+ vassert(guest_RIP_next_mustcheck == False);
+
+ addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
+
+ DIP("\t0x%llx: ", guest_RIP_bbstart+delta);
+
+ /* We may be asked to update the guest RIP before going further. */
+ if (put_IP)
+ stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr)) );
+
+ /* Spot "Special" instructions (see comment at top of file). */
+ {
+ UChar* code = (UChar*)(guest_code + delta);
+ /* Spot the 16-byte preamble:
+ 48C1C703 rolq $3, %rdi
+ 48C1C70D rolq $13, %rdi
+ 48C1C73D rolq $61, %rdi
+ 48C1C733 rolq $51, %rdi
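+         (these four rotate amounts sum to 128, i.e. 0 mod 64, so
+         the preamble is a no-op on %rdi when executed by a real CPU)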
+ */
+ if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
+ && code[ 3] == 0x03 &&
+ code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
+ && code[ 7] == 0x0D &&
+ code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
+ && code[11] == 0x3D &&
+ code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
+ && code[15] == 0x33) {
+ /* Got a "Special" instruction preamble. Which one is it? */
+ if (code[16] == 0x48 && code[17] == 0x87
+ && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
+ /* %RDX = client_request ( %RAX ) */
+ DIP("%%rdx = client_request ( %%rax )\n");
+ delta += 19;
+ jmp_lit(Ijk_ClientReq, guest_RIP_bbstart+delta);
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ else
+ if (code[16] == 0x48 && code[17] == 0x87
+ && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
+ /* %RAX = guest_NRADDR */
+ DIP("%%rax = guest_NRADDR\n");
+ delta += 19;
+ putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
+ goto decode_success;
+ }
+ else
+ if (code[16] == 0x48 && code[17] == 0x87
+ && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
+ /* call-noredir *%RAX */
+ DIP("call-noredir *%%rax\n");
+ delta += 19;
+ t1 = newTemp(Ity_I64);
+ assign(t1, getIRegRAX(8));
+ t2 = newTemp(Ity_I64);
+ assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
+ putIReg64(R_RSP, mkexpr(t2));
+ storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
+ jmp_treg(Ijk_NoRedir,t1);
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ /* We don't know what it is. */
+ goto decode_failure;
+ /*NOTREACHED*/
+ }
+ }
+
+ /* Eat prefixes, summarising the result in pfx and sz, and rejecting
+ as many invalid combinations as possible. */
+ n_prefixes = 0;
+ while (True) {
+ if (n_prefixes > 7) goto decode_failure;
+ pre = getUChar(delta);
+ switch (pre) {
+ case 0x66: pfx |= PFX_66; break;
+ case 0x67: pfx |= PFX_ASO; break;
+ case 0xF2: pfx |= PFX_F2; break;
+ case 0xF3: pfx |= PFX_F3; break;
+ case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
+ case 0x2E: pfx |= PFX_CS; break;
+ case 0x3E: pfx |= PFX_DS; break;
+ case 0x26: pfx |= PFX_ES; break;
+ case 0x64: pfx |= PFX_FS; break;
+ case 0x65: pfx |= PFX_GS; break;
+ case 0x36: pfx |= PFX_SS; break;
+ case 0x40 ... 0x4F:
+ pfx |= PFX_REX;
+ if (pre & (1<<3)) pfx |= PFX_REXW;
+ if (pre & (1<<2)) pfx |= PFX_REXR;
+ if (pre & (1<<1)) pfx |= PFX_REXX;
+ if (pre & (1<<0)) pfx |= PFX_REXB;
+ break;
+ default:
+ goto not_a_prefix;
+ }
+ n_prefixes++;
+ delta++;
+ }
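+   /* Note on the REX case above: e.g. the byte 0x48 (0100.1000)
+      sets PFX_REX|PFX_REXW, while 0x44 (0100.0100) sets
+      PFX_REX|PFX_REXR, extending the reg field of the following
+      modrm byte. */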
+
+ not_a_prefix:
+
+   /* Reject invalid prefix combinations */
+ n = 0;
+ if (pfx & PFX_F2) n++;
+ if (pfx & PFX_F3) n++;
+ if (n > 1)
+ goto decode_failure; /* can't have both */
+
+ n = 0;
+ if (pfx & PFX_CS) n++;
+ if (pfx & PFX_DS) n++;
+ if (pfx & PFX_ES) n++;
+ if (pfx & PFX_FS) n++;
+ if (pfx & PFX_GS) n++;
+ if (pfx & PFX_SS) n++;
+ if (n > 1)
+ goto decode_failure; /* multiple seg overrides == illegal */
+
+ /* We have a %fs prefix. Reject it if there's no evidence in 'vbi'
+ that we should accept it. */
+ if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_zero)
+ goto decode_failure;
+
+ /* Ditto for %gs prefixes. */
+ if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_0x60)
+ goto decode_failure;
+
+ /* Set up sz. */
+ sz = 4;
+ if (pfx & PFX_66) sz = 2;
+ if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
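+   /* e.g. with both a 66 prefix and REX.W present (66 48 ...), sz
+      ends up as 8, since the REX.W assignment comes last. */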
+
+ /* Now we should be looking at the primary opcode byte or the
+ leading F2 or F3. Check that any LOCK prefix is actually
+ allowed. */
+
+ if (pfx & PFX_LOCK) {
+ if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
+ DIP("lock ");
+ } else {
+ *expect_CAS = False;
+ goto decode_failure;
+ }
+ }
+
+
+ /* ---------------------------------------------------- */
+ /* --- The SSE/SSE2 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* What did I do to deserve SSE ? Perhaps I was really bad in a
+ previous life? */
+
+ /* Note, this doesn't handle SSE3 right now. All amd64s support
+ SSE2 as a minimum so there is no point distinguishing SSE1 vs
+ SSE2. */
+
+ insn = (UChar*)&guest_code[delta];
+
+   /* FXSAVE appears first here only because it occupies the same
+      position in guest-x86/toIR.c. */
+
+ /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
+ Note that REX.W 0F AE /0 writes a slightly different format and
+ we don't handle that here. */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xAE
+ && !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) == 0) {
+ IRDirty* d;
+ modrm = getUChar(delta+2);
+ vassert(sz == 4);
+ vassert(!epartIsReg(modrm));
+ /* REX.W must not be set. That should be assured us by sz == 4
+ above. */
+ vassert(!(pfx & PFX_REXW));
+
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+
+ DIP("fxsave %s\n", dis_buf);
+
+      /* Uses dirty helper:
+            void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State*, ULong ) */
+ d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_FXSAVE",
+ &amd64g_dirtyhelper_FXSAVE,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+
+ /* declare we're writing memory */
+ d->mFx = Ifx_Write;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 512;
+
+ /* declare we're reading guest state */
+ d->nFxState = 7;
+
+ d->fxState[0].fx = Ifx_Read;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Read;
+ d->fxState[1].offset = OFFB_FPREGS;
+ d->fxState[1].size = 8 * sizeof(ULong);
+
+ d->fxState[2].fx = Ifx_Read;
+ d->fxState[2].offset = OFFB_FPTAGS;
+ d->fxState[2].size = 8 * sizeof(UChar);
+
+ d->fxState[3].fx = Ifx_Read;
+ d->fxState[3].offset = OFFB_FPROUND;
+ d->fxState[3].size = sizeof(ULong);
+
+ d->fxState[4].fx = Ifx_Read;
+ d->fxState[4].offset = OFFB_FC3210;
+ d->fxState[4].size = sizeof(ULong);
+
+ d->fxState[5].fx = Ifx_Read;
+ d->fxState[5].offset = OFFB_XMM0;
+ d->fxState[5].size = 16 * sizeof(U128);
+
+ d->fxState[6].fx = Ifx_Read;
+ d->fxState[6].offset = OFFB_SSEROUND;
+ d->fxState[6].size = sizeof(ULong);
+
+ /* Be paranoid ... this assertion tries to ensure the 16 %xmm
+ images are packed back-to-back. If not, the value of
+ d->fxState[5].size is wrong. */
+ vassert(16 == sizeof(U128));
+ vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16));
+
+ stmt( IRStmt_Dirty(d) );
+
+ goto decode_success;
+ }
+
+ /* ------ SSE decoder main ------ */
+
+ /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x58) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addps", Iop_Add32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x58) {
+ delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "addss", Iop_Add32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 55 = ANDNPS -- G = (not G) and E */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x55) {
+ delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnps", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 0F 54 = ANDPS -- G = G and E */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x54) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andps", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xC2) {
+ delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpps", True, 4 );
+ goto decode_success;
+ }
+
+ /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xC2) {
+ delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpss", False, 4 );
+ goto decode_success;
+ }
+
+ /* 0F 2F = COMISS -- 32F0x4 comparison G,E, and set ZCP */
+ /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
+ IRTemp argL = newTemp(Ity_F32);
+ IRTemp argR = newTemp(Ity_F32);
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
+ 0/*lowest lane*/ ) );
+ delta += 2+1;
+ DIP("%scomiss %s,%s\n", insn[1]==0x2E ? "u" : "",
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("%scomiss %s,%s\n", insn[1]==0x2E ? "u" : "",
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+ assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
+ 0/*lowest lane*/ ) );
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
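+      /* VEX's Iop_CmpF64 encodes its result using the x86 FCOMI
+         flag encoding (0x45 = unordered, 0x40 = EQ, 0x01 = LT,
+         0x00 = GT), so masking with 0x45 keeps exactly the ZF, PF
+         and CF positions that COMISS/UCOMISS define. */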
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop( Iop_And64,
+ unop( Iop_32Uto64,
+ binop(Iop_CmpF64,
+ unop(Iop_F32toF64,mkexpr(argL)),
+ unop(Iop_F32toF64,mkexpr(argR)))),
+ mkU64(0x45)
+ )));
+
+ goto decode_success;
+ }
+
+ /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
+ half xmm */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x2A) {
+ IRTemp arg64 = newTemp(Ity_I64);
+ IRTemp rmode = newTemp(Ity_I32);
+
+ modrm = getUChar(delta+2);
+ do_MMX_preamble();
+ if (epartIsReg(modrm)) {
+ assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 2+1;
+ DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtpi2ps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+
+ putXMMRegLane32F(
+ gregOfRexRM(pfx,modrm), 0,
+ binop(Iop_F64toF32,
+ mkexpr(rmode),
+ unop(Iop_I32StoF64,
+ unop(Iop_64to32, mkexpr(arg64)) )) );
+
+ putXMMRegLane32F(
+ gregOfRexRM(pfx,modrm), 1,
+ binop(Iop_F64toF32,
+ mkexpr(rmode),
+ unop(Iop_I32StoF64,
+ unop(Iop_64HIto32, mkexpr(arg64)) )) );
+
+ goto decode_success;
+ }
+
+ /* F3 0F 2A = CVTSI2SS
+ -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
+ -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
+ if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x2A) {
+
+ IRTemp rmode = newTemp(Ity_I32);
+ assign( rmode, get_sse_roundingmode() );
+ modrm = getUChar(delta+2);
+
+ if (sz == 4) {
+ IRTemp arg32 = newTemp(Ity_I32);
+ if (epartIsReg(modrm)) {
+ assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtsi2ss %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+ putXMMRegLane32F(
+ gregOfRexRM(pfx,modrm), 0,
+ binop(Iop_F64toF32,
+ mkexpr(rmode),
+ unop(Iop_I32StoF64, mkexpr(arg32)) ) );
+ } else {
+ /* sz == 8 */
+ IRTemp arg64 = newTemp(Ity_I64);
+ if (epartIsReg(modrm)) {
+ assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtsi2ssq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+ putXMMRegLane32F(
+ gregOfRexRM(pfx,modrm), 0,
+ binop(Iop_F64toF32,
+ mkexpr(rmode),
+ binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
+ }
+
+ goto decode_success;
+ }
+
+ /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
+ I32 in mmx, according to prevailing SSE rounding mode */
+ /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
+ I32 in mmx, rounding towards zero */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
+ IRTemp dst64 = newTemp(Ity_I64);
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f32lo = newTemp(Ity_F32);
+ IRTemp f32hi = newTemp(Ity_F32);
+ Bool r2zero = toBool(insn[1] == 0x2C);
+
+ do_MMX_preamble();
+ modrm = getUChar(delta+2);
+
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
+ assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
+ DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
+ assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
+ mkexpr(addr),
+ mkU64(4) )));
+ delta += 2+alen;
+ DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
+ dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ if (r2zero) {
+ assign(rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
+
+ assign(
+ dst64,
+ binop( Iop_32HLto64,
+ binop( Iop_F64toI32S,
+ mkexpr(rmode),
+ unop( Iop_F32toF64, mkexpr(f32hi) ) ),
+ binop( Iop_F64toI32S,
+ mkexpr(rmode),
+ unop( Iop_F32toF64, mkexpr(f32lo) ) )
+ )
+ );
+
+ putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
+ goto decode_success;
+ }
+
+ /* F3 0F 2D = CVTSS2SI
+ when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
+ according to prevailing SSE rounding mode
+ when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
+ according to prevailing SSE rounding mode
+ */
+ /* F3 0F 2C = CVTTSS2SI
+ when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
+ truncating towards zero
+ when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
+ truncating towards zero
+ */
+ if (haveF3no66noF2(pfx)
+ && insn[0] == 0x0F
+ && (insn[1] == 0x2D || insn[1] == 0x2C)) {
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f32lo = newTemp(Ity_F32);
+ Bool r2zero = toBool(insn[1] == 0x2C);
+ vassert(sz == 4 || sz == 8);
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
+ DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameIReg(sz, gregOfRexRM(pfx,modrm), False));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
+ delta += 2+alen;
+ DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
+ dis_buf,
+ nameIReg(sz, gregOfRexRM(pfx,modrm), False));
+ }
+
+ if (r2zero) {
+ assign( rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
+
+ if (sz == 4) {
+ putIReg32( gregOfRexRM(pfx,modrm),
+ binop( Iop_F64toI32S,
+ mkexpr(rmode),
+ unop(Iop_F32toF64, mkexpr(f32lo))) );
+ } else {
+ putIReg64( gregOfRexRM(pfx,modrm),
+ binop( Iop_F64toI64S,
+ mkexpr(rmode),
+ unop(Iop_F32toF64, mkexpr(f32lo))) );
+ }
+
+ goto decode_success;
+ }
+
+ /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5E) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divps", Iop_Div32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5E) {
+ delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "divss", Iop_Div32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
+ if (insn[0] == 0x0F && insn[1] == 0xAE
+ && haveNo66noF2noF3(pfx)
+ && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 2) {
+
+ IRTemp t64 = newTemp(Ity_I64);
+ IRTemp ew = newTemp(Ity_I32);
+
+ vassert(sz == 4);
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ DIP("ldmxcsr %s\n", dis_buf);
+
+ /* The only thing we observe in %mxcsr is the rounding mode.
+ Therefore, pass the 32-bit value (SSE native-format control
+ word) to a clean helper, getting back a 64-bit value, the
+ lower half of which is the SSEROUND value to store, and the
+ upper half of which is the emulation-warning token which may
+ be generated.
+ */
+   /* ULong amd64g_check_ldmxcsr ( ULong ); */
+ assign( t64, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "amd64g_check_ldmxcsr",
+ &amd64g_check_ldmxcsr,
+ mkIRExprVec_1(
+ unop(Iop_32Uto64,
+ loadLE(Ity_I32, mkexpr(addr))
+ )
+ )
+ )
+ );
+
+ put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
+ assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
+ put_emwarn( mkexpr(ew) );
+ /* Finally, if an emulation warning was reported, side-exit to
+ the next insn, reporting the warning, so that Valgrind's
+ dispatcher sees the warning. */
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
+ Ijk_EmWarn,
+ IRConst_U64(guest_RIP_bbstart+delta)
+ )
+ );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F F7 = MASKMOVQ -- 8x8 masked store */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xF7) {
+ Bool ok = False;
+ delta = dis_MMX( &ok, vbi, pfx, sz, delta+1 );
+ if (!ok)
+ goto decode_failure;
+ goto decode_success;
+ }
+
+ /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5F) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxps", Iop_Max32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5F) {
+ delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "maxss", Iop_Max32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5D) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minps", Iop_Min32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5D) {
+ delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "minss", Iop_Min32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
+ /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
+ if (haveNo66noF2noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ getXMMReg( eregOfRexRM(pfx,modrm) ));
+ DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ if (insn[1] == 0x28/*movaps*/)
+ gen_SEGV_if_not_16_aligned( addr );
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("mov[ua]ps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
+ /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
+ if (haveNo66noF2noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && (insn[1] == 0x29 || insn[1] == 0x11)) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through; awaiting test case */
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ if (insn[1] == 0x29/*movaps*/)
+ gen_SEGV_if_not_16_aligned( addr );
+ storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
+ dis_buf );
+ delta += 2+alen;
+ goto decode_success;
+ }
+ }
+
+ /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
+ /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
+ if (haveNo66noF2noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x16) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
+ getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
+ DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movhps %s,%s\n", dis_buf,
+ nameXMMReg( gregOfRexRM(pfx,modrm) ));
+ }
+ goto decode_success;
+ }
+
+ /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
+ if (haveNo66noF2noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x17) {
+ if (!epartIsReg(insn[2])) {
+ delta += 2;
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
+ 1/*upper lane*/ ) );
+ DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
+ dis_buf);
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
+   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
+ if (haveNo66noF2noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x12) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ putXMMRegLane64( gregOfRexRM(pfx,modrm),
+ 0/*lower lane*/,
+ getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
+ DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ putXMMRegLane64( gregOfRexRM(pfx,modrm), 0/*lower lane*/,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movlps %s, %s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
+ }
+ goto decode_success;
+ }
+
+ /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
+ if (haveNo66noF2noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x13) {
+ if (!epartIsReg(insn[2])) {
+ delta += 2;
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
+ 0/*lower lane*/ ) );
+ DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
+ dis_buf);
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+   /* 0F 50 = MOVMSKPS -- move 4 sign bits from 4 x F32 in xmm(E)
+      to 4 lowest bits of ireg(G) */
+ if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x50) {
+ /* sz == 8 is a kludge to handle insns with REX.W redundantly
+ set to 1, which has been known to happen:
+
+ 4c 0f 50 d9 rex64X movmskps %xmm1,%r11d
+
+ 20071106: Intel docs say that REX.W isn't redundant: when
+ present, a 64-bit register is written; when not present, only
+ the 32-bit half is written. However, testing on a Core2
+ machine suggests the entire 64 bit register is written
+ irrespective of the status of REX.W. That could be because
+ of the default rule that says "if the lower half of a 32-bit
+ register is written, the upper half is zeroed". By using
+      putIReg32 here we inadvertently produce the same behaviour as
+ the Core2, for the same reason -- putIReg32 implements said
+ rule.
+
+ AMD docs give no indication that REX.W is even valid for this
+ insn. */
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ Int src;
+ t0 = newTemp(Ity_I32);
+ t1 = newTemp(Ity_I32);
+ t2 = newTemp(Ity_I32);
+ t3 = newTemp(Ity_I32);
+ delta += 2+1;
+ src = eregOfRexRM(pfx,modrm);
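+         /* Each tN below shifts lane N's sign bit down to result
+            bit N and masks it, so OR-ing t0..t3 yields the 4-bit
+            MOVMSKPS result. */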
+ assign( t0, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
+ mkU32(1) ));
+ assign( t1, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
+ mkU32(2) ));
+ assign( t2, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
+ mkU32(4) ));
+ assign( t3, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
+ mkU32(8) ));
+ putIReg32( gregOfRexRM(pfx,modrm),
+ binop(Iop_Or32,
+ binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
+ binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
+ )
+ );
+ DIP("movmskps %s,%s\n", nameXMMReg(src),
+ nameIReg32(gregOfRexRM(pfx,modrm)));
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
+ /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
+ if ( ( (haveNo66noF2noF3(pfx) && sz == 4)
+ || (have66noF2noF3(pfx) && sz == 2)
+ )
+ && insn[0] == 0x0F && insn[1] == 0x2B) {
+ modrm = getUChar(delta+2);
+ if (!epartIsReg(modrm)) {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F E7 = MOVNTQ -- for us, just a plain MMX store. Note, the
+ Intel manual does not say anything about the usual business of
+ the FP reg tags getting trashed whenever an MMX insn happens.
+ So we just leave them alone.
+ */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xE7) {
+ modrm = getUChar(delta+2);
+ if (!epartIsReg(modrm)) {
+ /* do_MMX_preamble(); Intel docs don't specify this */
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
+ DIP("movntq %s,%s\n", dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ delta += 2+alen;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
+ (lo 1/4 xmm). If E is mem, upper 3/4 of G is zeroed out. */
+ if (haveF3no66noF2(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x10) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
+ getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
+ DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
+ loadLE(Ity_I32, mkexpr(addr)) );
+ DIP("movss %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
+ or lo 1/4 xmm). */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x11) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through, we don't yet have a test case */
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ storeLE( mkexpr(addr),
+ getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
+ DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
+ dis_buf);
+ delta += 2+alen;
+ goto decode_success;
+ }
+ }
+
+ /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x59) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulps", Iop_Mul32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x59) {
+ delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "mulss", Iop_Mul32F0x4 );
+ goto decode_success;
+ }
+
+   /* 0F 56 = ORPS -- G = G or E */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x56) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orps", Iop_OrV128 );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xE0) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ vbi, pfx, delta+2, insn[1], "pavgb", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xE3) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ vbi, pfx, delta+2, insn[1], "pavgw", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
+ zero-extend of it in ireg(G). */
+ if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0xC5) {
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ IRTemp sV = newTemp(Ity_I64);
+ t5 = newTemp(Ity_I16);
+ do_MMX_preamble();
+ assign(sV, getMMXReg(eregLO3ofRM(modrm)));
+ breakup64to16s( sV, &t3, &t2, &t1, &t0 );
+ switch (insn[3] & 3) {
+ case 0: assign(t5, mkexpr(t0)); break;
+ case 1: assign(t5, mkexpr(t1)); break;
+ case 2: assign(t5, mkexpr(t2)); break;
+ case 3: assign(t5, mkexpr(t3)); break;
+ default: vassert(0);
+ }
+ if (sz == 8)
+ putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
+ else
+ putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
+ DIP("pextrw $%d,%s,%s\n",
+ (Int)insn[3], nameMMXReg(eregLO3ofRM(modrm)),
+ sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
+ : nameIReg32(gregOfRexRM(pfx,modrm))
+ );
+ delta += 4;
+ goto decode_success;
+ }
+ /* else fall through */
+ /* note, for anyone filling in the mem case: this insn has one
+ byte after the amode and therefore you must pass 1 as the
+ last arg to disAMode */
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
+ put it into the specified lane of mmx(G). */
+ if (haveNo66noF2noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0xC4) {
+ /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
+ mmx reg. t4 is the new lane value. t5 is the original
+ mmx value. t6 is the new mmx value. */
+ Int lane;
+ t4 = newTemp(Ity_I16);
+ t5 = newTemp(Ity_I64);
+ t6 = newTemp(Ity_I64);
+ modrm = insn[2];
+ do_MMX_preamble();
+
+ assign(t5, getMMXReg(gregLO3ofRM(modrm)));
+ breakup64to16s( t5, &t3, &t2, &t1, &t0 );
+
+ if (epartIsReg(modrm)) {
+ assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
+ delta += 3+1;
+ lane = insn[3+1-1];
+ DIP("pinsrw $%d,%s,%s\n", (Int)lane,
+ nameIReg16(eregOfRexRM(pfx,modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 1 );
+ delta += 3+alen;
+ lane = insn[3+alen-1];
+ assign(t4, loadLE(Ity_I16, mkexpr(addr)));
+ DIP("pinsrw $%d,%s,%s\n", (Int)lane,
+ dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ switch (lane & 3) {
+ case 0: assign(t6, mk64from16s(t3,t2,t1,t4)); break;
+ case 1: assign(t6, mk64from16s(t3,t2,t4,t0)); break;
+ case 2: assign(t6, mk64from16s(t3,t4,t1,t0)); break;
+ case 3: assign(t6, mk64from16s(t4,t2,t1,t0)); break;
+ default: vassert(0);
+ }
+ putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F EE = PMAXSW -- 16x4 signed max */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xEE) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ vbi, pfx, delta+2, insn[1], "pmaxsw", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F DE = PMAXUB -- 8x8 unsigned max */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xDE) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ vbi, pfx, delta+2, insn[1], "pmaxub", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F EA = PMINSW -- 16x4 signed min */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xEA) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ vbi, pfx, delta+2, insn[1], "pminsw", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F DA = PMINUB -- 8x8 unsigned min */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xDA) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ vbi, pfx, delta+2, insn[1], "pminub", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
+      mmx(E), turn them into a byte, and put zero-extend of it in
+ ireg(G). */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xD7) {
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ do_MMX_preamble();
+ t0 = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I64);
+ assign(t0, getMMXReg(eregLO3ofRM(modrm)));
+ assign(t1, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "amd64g_calculate_mmx_pmovmskb",
+ &amd64g_calculate_mmx_pmovmskb,
+ mkIRExprVec_1(mkexpr(t0))));
+ putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t1)));
+ DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameIReg32(gregOfRexRM(pfx,modrm)));
+ delta += 3;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+   /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xE4) {
+ do_MMX_preamble();
+      delta = dis_MMXop_regmem_to_reg (
+                 vbi, pfx, delta+2, insn[1], "pmulhuw", False );
+ goto decode_success;
+ }
+
+ /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
+   /* 0F 18 /1 = PREFETCHT0 -- with various different hints */
+   /* 0F 18 /2 = PREFETCHT1 */
+   /* 0F 18 /3 = PREFETCHT2 */
+ if (insn[0] == 0x0F && insn[1] == 0x18
+ && haveNo66noF2noF3(pfx)
+ && !epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) >= 0 && gregLO3ofRM(insn[2]) <= 3) {
+ HChar* hintstr = "??";
+
+ modrm = getUChar(delta+2);
+ vassert(!epartIsReg(modrm));
+
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+
+ switch (gregLO3ofRM(modrm)) {
+ case 0: hintstr = "nta"; break;
+ case 1: hintstr = "t0"; break;
+ case 2: hintstr = "t1"; break;
+ case 3: hintstr = "t2"; break;
+ default: vassert(0);
+ }
+
+ DIP("prefetch%s %s\n", hintstr, dis_buf);
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xF6) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ vbi, pfx, delta+2, insn[1], "psadbw", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x70) {
+ Int order;
+ IRTemp sV, dV, s3, s2, s1, s0;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+ sV = newTemp(Ity_I64);
+ dV = newTemp(Ity_I64);
+ do_MMX_preamble();
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ order = (Int)insn[3];
+ delta += 2+2;
+ DIP("pshufw $%d,%s,%s\n", order,
+ nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
+ 1/*extra byte after amode*/ );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ order = (Int)insn[2+alen];
+ delta += 3+alen;
+ DIP("pshufw $%d,%s,%s\n", order,
+ dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+ breakup64to16s( sV, &s3, &s2, &s1, &s0 );
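+   /* Two 'order' bits select each result lane; e.g. order == 0xE4
+      (binary 11 10 01 00) picks s3,s2,s1,s0, the identity shuffle. */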
+# define SEL(n) \
+ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
+ assign(dV,
+ mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
+ SEL((order>>2)&3), SEL((order>>0)&3) )
+ );
+ putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
+# undef SEL
+ goto decode_success;
+ }
+
+ /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x53) {
+ delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
+ "rcpps", Iop_Recip32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x53) {
+ delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
+ "rcpss", Iop_Recip32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x52) {
+ delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
+ "rsqrtps", Iop_RSqrt32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x52) {
+ delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
+ "rsqrtss", Iop_RSqrt32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F AE /7 = SFENCE -- flush pending operations to memory */
+ if (haveNo66noF2noF3(pfx)
+ && insn[0] == 0x0F && insn[1] == 0xAE
+ && epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 7
+ && sz == 4) {
+ delta += 3;
+ /* Insert a memory fence. It's sometimes important that these
+ are carried through to the generated code. */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("sfence\n");
+ goto decode_success;
+ }
+
+ /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xC6) {
+ Int select;
+ IRTemp sV, dV;
+ IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ select = (Int)insn[3];
+ delta += 2+2;
+ DIP("shufps $%d,%s,%s\n", select,
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
+ 1/*byte at end of insn*/ );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ select = (Int)insn[2+alen];
+ delta += 3+alen;
+ DIP("shufps $%d,%s,%s\n", select,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ breakup128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+
+# define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
+# define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
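+
+   /* Result lanes 0..1 of SHUFPS are selected from the dest (SELD)
+      and lanes 2..3 from the source (SELS), two 'select' bits per
+      lane. */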
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
+ SELD((select>>2)&3), SELD((select>>0)&3) )
+ );
+
+# undef SELD
+# undef SELS
+
+ goto decode_success;
+ }
+
+ /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x51) {
+ delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
+ "sqrtps", Iop_Sqrt32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x51) {
+ delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
+ "sqrtss", Iop_Sqrt32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
+ if (insn[0] == 0x0F && insn[1] == 0xAE
+ && haveNo66noF2noF3(pfx)
+ && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 3) {
+
+ vassert(sz == 4);
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+
+ /* Fake up a native SSE mxcsr word. The only thing it depends
+ on is SSEROUND[1:0], so call a clean helper to cook it up.
+ */
+   /* ULong amd64g_create_mxcsr ( ULong sseround ) */
+ DIP("stmxcsr %s\n", dis_buf);
+ storeLE(
+ mkexpr(addr),
+ unop(Iop_64to32,
+ mkIRExprCCall(
+ Ity_I64, 0/*regp*/,
+ "amd64g_create_mxcsr", &amd64g_create_mxcsr,
+ mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
+ )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5C) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subps", Iop_Sub32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5C) {
+ delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "subss", Iop_Sub32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
+ /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
+ /* These just appear to be special cases of SHUFPS */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
+ IRTemp sV, dV;
+ IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
+ Bool hi = toBool(insn[1] == 0x15);
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ breakup128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
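+   /* hi: interleave the upper two lanes of each operand, giving
+      s3 d3 s2 d2; lo: the lower two, giving s1 d1 s0 d0. */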
+
+ if (hi) {
+ putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s3, d3, s2, d2 ) );
+ } else {
+ putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s1, d1, s0, d0 ) );
+ }
+
+ goto decode_success;
+ }
+
+   /* 0F 57 = XORPS -- G = G xor E */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x57) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorps", Iop_XorV128 );
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSE decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* ---------------------------------------------------- */
+ /* --- start of the SSE2 decoder. --- */
+ /* ---------------------------------------------------- */
+
+   /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x58) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addpd", Iop_Add64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
+ if (haveF2no66noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x58) {
+ delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "addsd", Iop_Add64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 55 = ANDNPD -- G = (not G) and E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x55) {
+ delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnpd", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F 54 = ANDPD -- G = G and E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x54) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andpd", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xC2) {
+ delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmppd", True, 8 );
+ goto decode_success;
+ }
+
+ /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
+ if (haveF2no66noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xC2) {
+ delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpsd", False, 8 );
+ goto decode_success;
+ }
+
+ /* 66 0F 2F = COMISD -- 64F0x2 comparison G,E, and set ZCP */
+ /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
+ IRTemp argL = newTemp(Ity_F64);
+ IRTemp argR = newTemp(Ity_F64);
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
+ 0/*lowest lane*/ ) );
+ delta += 2+1;
+ DIP("%scomisd %s,%s\n", insn[1]==0x2E ? "u" : "",
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("%scomisd %s,%s\n", insn[1]==0x2E ? "u" : "",
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+ assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
+ 0/*lowest lane*/ ) );
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop( Iop_And64,
+ unop( Iop_32Uto64,
+ binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
+ mkU64(0x45)
+ )));
+
+ goto decode_success;
+ }
+
+ /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
+ F64 in xmm(G) */
+ if (haveF3no66noF2(pfx) && insn[0] == 0x0F && insn[1] == 0xE6) {
+ IRTemp arg64 = newTemp(Ity_I64);
+ if (sz != 4) goto decode_failure;
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( arg64, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0) );
+ delta += 2+1;
+ DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtdq2pd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+
+ putXMMRegLane64F(
+ gregOfRexRM(pfx,modrm), 0,
+ unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
+ );
+
+ putXMMRegLane64F(
+ gregOfRexRM(pfx,modrm), 1,
+ unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
+ );
+
+ goto decode_success;
+ }
+
+ /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
+ xmm(G) */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5B) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtdq2ps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+ breakup128to32s( argV, &t3, &t2, &t1, &t0 );
+
+# define CVT(_t) binop( Iop_F64toF32, \
+ mkexpr(rmode), \
+ unop(Iop_I32StoF64,mkexpr(_t)))
+
+ putXMMRegLane32F( gregOfRexRM(pfx,modrm), 3, CVT(t3) );
+ putXMMRegLane32F( gregOfRexRM(pfx,modrm), 2, CVT(t2) );
+ putXMMRegLane32F( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
+ putXMMRegLane32F( gregOfRexRM(pfx,modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
+ lo half xmm(G), and zero upper half, rounding towards zero */
+ /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
+ lo half xmm(G), according to prevailing rounding mode, and zero
+ upper half */
+ if ( ( (haveF2no66noF3(pfx) && sz == 4)
+ || (have66noF2noF3(pfx) && sz == 2)
+ )
+ && insn[0] == 0x0F && insn[1] == 0xE6) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+ Bool r2zero = toBool(sz == 2);
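+      /* sz == 2 here means the 66-prefixed form, i.e. the
+         truncating variant CVTTPD2DQ. */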
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("cvt%spd2dq %s,%s\n", r2zero ? "t" : "",
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvt%spd2dq %s,%s\n", r2zero ? "t" : "",
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+
+ if (r2zero) {
+ assign(rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
+
+ t0 = newTemp(Ity_F64);
+ t1 = newTemp(Ity_F64);
+ assign( t0, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128to64, mkexpr(argV))) );
+ assign( t1, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128HIto64, mkexpr(argV))) );
+
+# define CVT(_t) binop( Iop_F64toI32S, \
+ mkexpr(rmode), \
+ mkexpr(_t) )
+
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, mkU32(0) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, mkU32(0) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
+ I32 in mmx, according to prevailing SSE rounding mode */
+ /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
+ I32 in mmx, rounding towards zero */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
+ IRTemp dst64 = newTemp(Ity_I64);
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f64lo = newTemp(Ity_F64);
+ IRTemp f64hi = newTemp(Ity_F64);
+ Bool r2zero = toBool(insn[1] == 0x2C);
+
+ do_MMX_preamble();
+ modrm = getUChar(delta+2);
+
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
+ assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
+ DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
+ assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
+ mkexpr(addr),
+ mkU64(8) )));
+ delta += 2+alen;
+ DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
+ dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ if (r2zero) {
+ assign(rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
+
+ assign(
+ dst64,
+ binop( Iop_32HLto64,
+ binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
+ binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
+ )
+ );
+
+ putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
+ goto decode_success;
+ }
+
+ /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
+ lo half xmm(G), rounding according to prevailing SSE rounding
+ mode, and zero upper half */
+ /* Note, this is practically identical to CVTPD2DQ. It would have
+ been nicer to merge them together, but the insn[] offsets differ
+ by one. */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x5A) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtpd2ps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+ t0 = newTemp(Ity_F64);
+ t1 = newTemp(Ity_F64);
+ assign( t0, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128to64, mkexpr(argV))) );
+ assign( t1, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128HIto64, mkexpr(argV))) );
+
+# define CVT(_t) binop( Iop_F64toF32, \
+ mkexpr(rmode), \
+ mkexpr(_t) )
+
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, mkU32(0) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, mkU32(0) );
+ putXMMRegLane32F( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
+ putXMMRegLane32F( gregOfRexRM(pfx,modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
+ xmm(G) */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x2A) {
+ IRTemp arg64 = newTemp(Ity_I64);
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+         /* Only switch to MMX mode if the source is an MMX register.
+            This is inconsistent with all other instructions which
+            convert between XMM and (M64 or MMX), which always switch
+            to MMX mode even if the 64-bit operand is M64 rather than
+            an MMX register.  At least, that's what the Intel docs
+            seem to me to say.  Fixes #210264. */
+ do_MMX_preamble();
+ assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 2+1;
+ DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtpi2pd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+
+ putXMMRegLane64F(
+ gregOfRexRM(pfx,modrm), 0,
+ unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
+ );
+
+ putXMMRegLane64F(
+ gregOfRexRM(pfx,modrm), 1,
+ unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
+ );
+
+ goto decode_success;
+ }
+
+ /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
+ xmm(G), rounding towards zero */
+ /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
+ xmm(G), as per the prevailing rounding mode */
+ if ( ( (have66noF2noF3(pfx) && sz == 2)
+ || (haveF3no66noF2(pfx) && sz == 4)
+ )
+ && insn[0] == 0x0F && insn[1] == 0x5B) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+ Bool r2zero = toBool(sz == 4);
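+      /* sz == 4 here means the F3-prefixed form, i.e. the
+         truncating variant CVTTPS2DQ. */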
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtps2dq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+
+ if (r2zero) {
+ assign( rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
+
+ breakup128to32s( argV, &t3, &t2, &t1, &t0 );
+
+ /* This is less than ideal. If it turns out to be a performance
+ bottleneck it can be improved. */
+# define CVT(_t) \
+ binop( Iop_F64toI32S, \
+ mkexpr(rmode), \
+ unop( Iop_F32toF64, \
+ unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
+
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, CVT(t3) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, CVT(t2) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
+ putXMMRegLane32( gregOfRexRM(pfx,modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
+ F64 in xmm(G). */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5A) {
+ IRTemp f32lo = newTemp(Ity_F32);
+ IRTemp f32hi = newTemp(Ity_F32);
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0) );
+ assign( f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1) );
+ delta += 2+1;
+ DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
+ assign( f32hi, loadLE(Ity_F32,
+ binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
+ delta += 2+alen;
+ DIP("cvtps2pd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+
+ putXMMRegLane64F( gregOfRexRM(pfx,modrm), 1,
+ unop(Iop_F32toF64, mkexpr(f32hi)) );
+ putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
+ unop(Iop_F32toF64, mkexpr(f32lo)) );
+
+ goto decode_success;
+ }
+
+ /* F2 0F 2D = CVTSD2SI
+ when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
+ according to prevailing SSE rounding mode
+ when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
+ according to prevailing SSE rounding mode
+ */
+ /* F2 0F 2C = CVTTSD2SI
+ when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
+ truncating towards zero
+ when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
+ truncating towards zero
+ */
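+   /* e.g. for f64lo = 2.7, CVTTSD2SI gives 2 (truncation), whereas
+      CVTSD2SI under the default round-to-nearest-even mode gives 3. */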
+ if (haveF2no66noF3(pfx)
+ && insn[0] == 0x0F
+ && (insn[1] == 0x2D || insn[1] == 0x2C)) {
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f64lo = newTemp(Ity_F64);
+ Bool r2zero = toBool(insn[1] == 0x2C);
+ vassert(sz == 4 || sz == 8);
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
+ DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameIReg(sz, gregOfRexRM(pfx,modrm), False));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
+ delta += 2+alen;
+ DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
+ dis_buf,
+ nameIReg(sz, gregOfRexRM(pfx,modrm), False));
+ }
+
+ if (r2zero) {
+ assign( rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
+
+ if (sz == 4) {
+ putIReg32( gregOfRexRM(pfx,modrm),
+ binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
+ } else {
+ putIReg64( gregOfRexRM(pfx,modrm),
+ binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
+ }
+
+ goto decode_success;
+ }
+
+ /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
+ low 1/4 xmm(G), according to prevailing SSE rounding mode */
+ if (haveF2no66noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5A) {
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f64lo = newTemp(Ity_F64);
+ vassert(sz == 4);
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
+ DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
+ delta += 2+alen;
+ DIP("cvtsd2ss %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+ putXMMRegLane32F(
+ gregOfRexRM(pfx,modrm), 0,
+ binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
+ );
+
+ goto decode_success;
+ }
+
+ /* F2 0F 2A = CVTSI2SD
+ when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
+ when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
+ */
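+   /* Note the asymmetry below: the I32 case uses the unary, exact
+      Iop_I32StoF64, since every I32 is exactly representable as an
+      F64, whereas the I64 case must consult the SSE rounding mode,
+      since an I64 can exceed the 53-bit precision of an F64. */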
+ if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x2A) {
+ modrm = getUChar(delta+2);
+
+ if (sz == 4) {
+ IRTemp arg32 = newTemp(Ity_I32);
+ if (epartIsReg(modrm)) {
+ assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("cvtsi2sd %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtsi2sd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+ putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
+ unop(Iop_I32StoF64, mkexpr(arg32))
+ );
+ } else {
+ /* sz == 8 */
+ IRTemp arg64 = newTemp(Ity_I64);
+ if (epartIsReg(modrm)) {
+ assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtsi2sdq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)) );
+ }
+ putXMMRegLane64F(
+ gregOfRexRM(pfx,modrm),
+ 0,
+ binop( Iop_I64StoF64,
+ get_sse_roundingmode(),
+ mkexpr(arg64)
+ )
+ );
+
+ }
+
+ goto decode_success;
+ }
+
+ /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
+ low half xmm(G) */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5A) {
+ IRTemp f32lo = newTemp(Ity_F32);
+
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
+ DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
+ delta += 2+alen;
+ DIP("cvtss2sd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
+ unop( Iop_F32toF64, mkexpr(f32lo) ) );
+
+ goto decode_success;
+ }
+
+ /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x5E) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divpd", Iop_Div64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
+ if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x5E) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "divsd", Iop_Div64F0x2 );
+ goto decode_success;
+ }
+
+ /* 0F AE /5 = LFENCE -- flush pending operations to memory */
+ /* 0F AE /6 = MFENCE -- flush pending operations to memory */
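+   /* Both are modelled by the same single IR fence (Imbe_Fence),
+      which is at least as strong as either instruction requires. */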
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xAE
+ && epartIsReg(insn[2])
+ && (gregLO3ofRM(insn[2]) == 5 || gregLO3ofRM(insn[2]) == 6)) {
+ delta += 3;
+ /* Insert a memory fence. It's sometimes important that these
+ are carried through to the generated code. */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("%sfence\n", gregLO3ofRM(insn[2])==5 ? "l" : "m");
+ goto decode_success;
+ }
+
+ /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x5F) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxpd", Iop_Max64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
+ if (haveF2no66noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5F) {
+ delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "maxsd", Iop_Max64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x5D) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minpd", Iop_Min64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
+ if (haveF2no66noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x5D) {
+ delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "minsd", Iop_Min64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
+ /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
+ /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F
+ && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
+ HChar* wot = insn[1]==0x28 ? "apd" :
+ insn[1]==0x10 ? "upd" : "dqa";
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ getXMMReg( eregOfRexRM(pfx,modrm) ));
+ DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
+ gen_SEGV_if_not_16_aligned( addr );
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("mov%s %s,%s\n", wot, dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
+ /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
+ if (have66noF2noF3(pfx) && insn[0] == 0x0F
+ && (insn[1] == 0x29 || insn[1] == 0x11)) {
+ HChar* wot = insn[1]==0x29 ? "apd" : "upd";
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ putXMMReg( eregOfRexRM(pfx,modrm),
+ getXMMReg( gregOfRexRM(pfx,modrm) ) );
+ DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRexRM(pfx,modrm)),
+ nameXMMReg(eregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ if (insn[1] == 0x29/*movapd*/)
+ gen_SEGV_if_not_16_aligned( addr );
+ storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRexRM(pfx,modrm)),
+ dis_buf );
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4, zeroing high 3/4 of xmm. */
+ /* or from ireg64/m64 to xmm lo 1/2, zeroing high 1/2 of xmm. */
+ if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x6E) {
+ vassert(sz == 2 || sz == 8);
+ if (sz == 2) sz = 4;
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ if (sz == 4) {
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
+ );
+ DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
+ );
+ DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ sz == 4
+ ? unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
+ : unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
+ );
+ DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
+ /* or from xmm low 1/2 to ireg64 or m64. */
+ if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x7E) {
+ if (sz == 2) sz = 4;
+ vassert(sz == 4 || sz == 8);
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ if (sz == 4) {
+ putIReg32( eregOfRexRM(pfx,modrm),
+ getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
+ DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
+ nameIReg32(eregOfRexRM(pfx,modrm)));
+ } else {
+ putIReg64( eregOfRexRM(pfx,modrm),
+ getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
+ DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
+ nameIReg64(eregOfRexRM(pfx,modrm)));
+ }
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ storeLE( mkexpr(addr),
+ sz == 4
+ ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
+ : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
+ DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
+ nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x7F) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ putXMMReg( eregOfRexRM(pfx,modrm),
+ getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
+ nameXMMReg(eregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ delta += 2+alen;
+ storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
+ }
+ goto decode_success;
+ }
+
+ /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x6F) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ getXMMReg( eregOfRexRM(pfx,modrm) ));
+ DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("movdqu %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x7F) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ goto decode_failure; /* awaiting test case */
+ delta += 2+1;
+ putXMMReg( eregOfRexRM(pfx,modrm),
+ getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
+ nameXMMReg(eregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
+ }
+ goto decode_success;
+ }
+
+ /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
+ if (haveF2no66noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xD6) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ do_MMX_preamble();
+ putMMXReg( gregLO3ofRM(modrm),
+ getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
+ DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ delta += 2+1;
+ goto decode_success;
+ } else {
+ /* apparently no mem case for this insn */
+ goto decode_failure;
+ }
+ }
+
+ /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
+   /* This seems identical to MOVHPS.  This instruction encoding is
+      completely crazy. */
+ if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x16) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through; apparently reg-reg is not possible */
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movhpd %s,%s\n", dis_buf,
+ nameXMMReg( gregOfRexRM(pfx,modrm) ));
+ goto decode_success;
+ }
+ }
+
+ /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
+ /* Again, this seems identical to MOVHPS. */
+ if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x17) {
+ if (!epartIsReg(insn[2])) {
+ delta += 2;
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
+ 1/*upper lane*/ ) );
+ DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
+ dis_buf);
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
+ /* Identical to MOVLPS ? */
+ if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x12) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through; apparently reg-reg is not possible */
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ putXMMRegLane64( gregOfRexRM(pfx,modrm),
+ 0/*lower lane*/,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movlpd %s, %s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
+ goto decode_success;
+ }
+ }
+
+ /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
+ /* Identical to MOVLPS ? */
+ if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x13) {
+ modrm = getUChar(delta+2);
+ if (!epartIsReg(modrm)) {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRexRM(pfx,modrm),
+ 0/*lower lane*/ ) );
+ DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
+ dis_buf);
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
+ 2 lowest bits of ireg(G) */
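+   /* e.g. xmm(E) = { lo = -1.0, hi = +2.0 } gives bit 0 = 1 (sign of
+      the low F64) and bit 1 = 0 (sign of the high F64), so ireg(G)
+      gets 1. */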
+ if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x50) {
+ /* sz == 8 is a kludge to handle insns with REX.W redundantly
+ set to 1, which has been known to happen:
+ 66 4c 0f 50 d9 rex64X movmskpd %xmm1,%r11d
+ 20071106: see further comments on MOVMSKPS implementation above.
+ */
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ Int src;
+ t0 = newTemp(Ity_I32);
+ t1 = newTemp(Ity_I32);
+ delta += 2+1;
+ src = eregOfRexRM(pfx,modrm);
+ assign( t0, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
+ mkU32(1) ));
+ assign( t1, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
+ mkU32(2) ));
+ putIReg32( gregOfRexRM(pfx,modrm),
+ binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
+ );
+ DIP("movmskpd %s,%s\n", nameXMMReg(src),
+ nameIReg32(gregOfRexRM(pfx,modrm)));
+ goto decode_success;
+ }
+      /* no memory form of this insn exists */
+      goto decode_failure;
+ }
+
+ /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xF7) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ IRTemp regD = newTemp(Ity_V128);
+ IRTemp mask = newTemp(Ity_V128);
+ IRTemp olddata = newTemp(Ity_V128);
+ IRTemp newdata = newTemp(Ity_V128);
+ addr = newTemp(Ity_I64);
+
+ assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
+ assign( regD, getXMMReg( gregOfRexRM(pfx,modrm) ));
+
+ /* Unfortunately can't do the obvious thing with SarN8x16
+ here since that can't be re-emitted as SSE2 code - no such
+ insn. */
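+         /* Instead, build the mask in two 64-bit halves: SarN8x8 by 7
+            copies each byte's sign bit through the whole byte, e.g.
+            0x80 -> 0xFF and 0x7F -> 0x00.  Note that this emulation
+            does a full 16-byte load and store, whereas the real insn
+            writes only the selected bytes. */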
+ assign(
+ mask,
+ binop(Iop_64HLtoV128,
+ binop(Iop_SarN8x8,
+ getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
+ mkU8(7) ),
+ binop(Iop_SarN8x8,
+ getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
+ mkU8(7) ) ));
+ assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
+ assign( newdata,
+ binop(Iop_OrV128,
+ binop(Iop_AndV128,
+ mkexpr(regD),
+ mkexpr(mask) ),
+ binop(Iop_AndV128,
+ mkexpr(olddata),
+ unop(Iop_NotV128, mkexpr(mask)))) );
+ storeLE( mkexpr(addr), mkexpr(newdata) );
+
+ delta += 2+1;
+ DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRexRM(pfx,modrm) ),
+ nameXMMReg( gregOfRexRM(pfx,modrm) ) );
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xE7) {
+ modrm = getUChar(delta+2);
+ if (!epartIsReg(modrm)) {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
+ DIP("movntdq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ goto decode_success;
+ }
+      /* the register form of this insn does not exist */
+      goto decode_failure;
+ }
+
+ /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
+ if (haveNo66noF2noF3(pfx) &&
+ insn[0] == 0x0F && insn[1] == 0xC3) {
+ vassert(sz == 4 || sz == 8);
+ modrm = getUChar(delta+2);
+ if (!epartIsReg(modrm)) {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
+ DIP("movnti %s,%s\n", dis_buf,
+ nameIRegG(sz, pfx, modrm));
+ delta += 2+alen;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
+ or lo half xmm). */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0xD6) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through, awaiting test case */
+ /* dst: lo half copied, hi half zeroed */
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
+ DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
+ delta += 2+alen;
+ goto decode_success;
+ }
+ }
+
+ /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
+ hi half). */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xD6) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ do_MMX_preamble();
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
+ DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ goto decode_success;
+ } else {
+ /* apparently no mem case for this insn */
+ goto decode_failure;
+ }
+ }
+
+ /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
+ G (lo half xmm). Upper half of G is zeroed out. */
+ /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
+ G (lo half xmm). If E is mem, upper half of G is zeroed out.
+ If E is reg, upper half of G is unchanged. */
+ if ( (haveF2no66noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x10)
+ ||
+ (haveF3no66noF2(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x7E)
+ ) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
+ getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
+ if (insn[1] == 0x7E/*MOVQ*/) {
+ /* zero bits 127:64 */
+ putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
+ }
+ DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
+ putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movsd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
+ or lo half xmm). */
+ if (haveF2no66noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x11) {
+ modrm = getUChar(delta+2);
+ if (epartIsReg(modrm)) {
+ putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
+ getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
+ DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
+ nameXMMReg(eregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ storeLE( mkexpr(addr),
+ getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
+ DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
+ dis_buf);
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x59) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulpd", Iop_Mul64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
+ if (haveF2no66noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x59) {
+ delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "mulsd", Iop_Mul64F0x2 );
+ goto decode_success;
+ }
+
+   /* 66 0F 56 = ORPD -- G = G or E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x56) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orpd", Iop_OrV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
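+   /* The result is { lo = dV.w64[select bit 0], hi = sV.w64[select
+      bit 1] }; e.g. select = 2 (binary 10) gives { lo = d0, hi = s1 }. */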
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xC6) {
+ Int select;
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp s1 = newTemp(Ity_I64);
+ IRTemp s0 = newTemp(Ity_I64);
+ IRTemp d1 = newTemp(Ity_I64);
+ IRTemp d0 = newTemp(Ity_I64);
+
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ select = (Int)insn[3];
+ delta += 2+2;
+ DIP("shufpd $%d,%s,%s\n", select,
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 1 );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ select = (Int)insn[2+alen];
+ delta += 3+alen;
+ DIP("shufpd $%d,%s,%s\n", select,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
+
+# define SELD(n) mkexpr((n)==0 ? d0 : d1)
+# define SELS(n) mkexpr((n)==0 ? s0 : s1)
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
+ );
+
+# undef SELD
+# undef SELS
+
+ goto decode_success;
+ }
+
+ /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x51) {
+ delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
+ "sqrtpd", Iop_Sqrt64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
+ if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x51) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta+2,
+ "sqrtsd", Iop_Sqrt64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x5C) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subpd", Iop_Sub64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
+ if (haveF2no66noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x5C) {
+ delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "subsd", Iop_Sub64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
+ /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
+   /* These just appear to be special cases of SHUFPD */
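+   /* UNPCKLPD gives { lo = d0, hi = s0 } and UNPCKHPD gives
+      { lo = d1, hi = s1 }, i.e. SHUFPD with select = 0 and select = 3
+      respectively. */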
+ if (have66noF2noF3(pfx)
+ && sz == 2 /* could be 8 if rex also present */
+ && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
+ IRTemp s1 = newTemp(Ity_I64);
+ IRTemp s0 = newTemp(Ity_I64);
+ IRTemp d1 = newTemp(Ity_I64);
+ IRTemp d0 = newTemp(Ity_I64);
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ Bool hi = toBool(insn[1] == 0x15);
+
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
+
+ if (hi) {
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
+ } else {
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
+ }
+
+ goto decode_success;
+ }
+
+ /* 66 0F 57 = XORPD -- G = G xor E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x57) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorpd", Iop_XorV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F 6B = PACKSSDW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x6B) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "packssdw", Iop_QNarrow32Sx4, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 63 = PACKSSWB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x63) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "packsswb", Iop_QNarrow16Sx8, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 67 = PACKUSWB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x67) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "packuswb", Iop_QNarrow16Ux8, True );
+ goto decode_success;
+ }
+
+ /* 66 0F FC = PADDB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xFC) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "paddb", Iop_Add8x16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F FE = PADDD */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xFE) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "paddd", Iop_Add32x4, False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
+ /* 0F D4 = PADDQ -- add 64x1 */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xD4) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ vbi, pfx, delta+2, insn[1], "paddq", False );
+ goto decode_success;
+ }
+
+ /* 66 0F D4 = PADDQ */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xD4) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "paddq", Iop_Add64x2, False );
+ goto decode_success;
+ }
+
+ /* 66 0F FD = PADDW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xFD) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "paddw", Iop_Add16x8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F EC = PADDSB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xEC) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "paddsb", Iop_QAdd8Sx16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F ED = PADDSW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xED) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "paddsw", Iop_QAdd16Sx8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DC = PADDUSB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xDC) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "paddusb", Iop_QAdd8Ux16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DD = PADDUSW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xDD) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "paddusw", Iop_QAdd16Ux8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DB = PAND */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xDB) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pand", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F DF = PANDN */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xDF) {
+ delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "pandn", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F E0 = PAVGB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xE0) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pavgb", Iop_Avg8Ux16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F E3 = PAVGW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xE3) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pavgw", Iop_Avg16Ux8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 74 = PCMPEQB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x74) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pcmpeqb", Iop_CmpEQ8x16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 76 = PCMPEQD */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x76) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pcmpeqd", Iop_CmpEQ32x4, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 75 = PCMPEQW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x75) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pcmpeqw", Iop_CmpEQ16x8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 64 = PCMPGTB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x64) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pcmpgtb", Iop_CmpGT8Sx16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 66 = PCMPGTD */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x66) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pcmpgtd", Iop_CmpGT32Sx4, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 65 = PCMPGTW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x65) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pcmpgtw", Iop_CmpGT16Sx8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
+ zero-extend of it in ireg(G). */
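+   /* Lane n of xmm(E) occupies bits [16n+15 : 16n]; e.g. an immediate
+      of 5 selects bits 95:80, the high half of the third 32-bit
+      chunk. */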
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0xC5) {
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ t5 = newTemp(Ity_V128);
+ t4 = newTemp(Ity_I16);
+ assign(t5, getXMMReg(eregOfRexRM(pfx,modrm)));
+ breakup128to32s( t5, &t3, &t2, &t1, &t0 );
+ switch (insn[3] & 7) {
+ case 0: assign(t4, unop(Iop_32to16, mkexpr(t0))); break;
+ case 1: assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
+ case 2: assign(t4, unop(Iop_32to16, mkexpr(t1))); break;
+ case 3: assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
+ case 4: assign(t4, unop(Iop_32to16, mkexpr(t2))); break;
+ case 5: assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
+ case 6: assign(t4, unop(Iop_32to16, mkexpr(t3))); break;
+ case 7: assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
+ default: vassert(0);
+ }
+ putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t4)));
+ DIP("pextrw $%d,%s,%s\n",
+ (Int)insn[3], nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameIReg32(gregOfRexRM(pfx,modrm)));
+ delta += 4;
+ goto decode_success;
+ }
+ /* else fall through */
+ /* note, if memory case is ever filled in, there is 1 byte after
+ amode */
+ }
+
+ /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
+ put it into the specified lane of xmm(G). */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0xC4) {
+ Int lane;
+ t4 = newTemp(Ity_I16);
+ modrm = insn[2];
+
+ if (epartIsReg(modrm)) {
+ assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
+ delta += 3+1;
+ lane = insn[3+1-1];
+ DIP("pinsrw $%d,%s,%s\n", (Int)lane,
+ nameIReg16(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
+ 1/*byte after the amode*/ );
+ delta += 3+alen;
+ lane = insn[3+alen-1];
+ assign(t4, loadLE(Ity_I16, mkexpr(addr)));
+ DIP("pinsrw $%d,%s,%s\n", (Int)lane,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ putXMMRegLane16( gregOfRexRM(pfx,modrm), lane & 7, mkexpr(t4) );
+ goto decode_success;
+ }
+
+ /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
+ E(xmm or mem) to G(xmm) */
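+   /* Lane-wise: result32[i] = s1.w[2i]*s2.w[2i] + s1.w[2i+1]*s2.w[2i+1],
+      using signed 16x16->32 multiplies; e.g. if every 16-bit lane of
+      both operands is 1, each 32-bit result lane is 2. */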
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xF5) {
+ IRTemp s1V = newTemp(Ity_V128);
+ IRTemp s2V = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp s1Hi = newTemp(Ity_I64);
+ IRTemp s1Lo = newTemp(Ity_I64);
+ IRTemp s2Hi = newTemp(Ity_I64);
+ IRTemp s2Lo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( s1V, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("pmaddwd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+ assign( s2V, getXMMReg(gregOfRexRM(pfx,modrm)) );
+ assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
+ assign( s1Lo, unop(Iop_V128to64, mkexpr(s1V)) );
+ assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
+ assign( s2Lo, unop(Iop_V128to64, mkexpr(s2V)) );
+ assign( dHi, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "amd64g_calculate_mmx_pmaddwd",
+ &amd64g_calculate_mmx_pmaddwd,
+ mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
+ ));
+ assign( dLo, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "amd64g_calculate_mmx_pmaddwd",
+ &amd64g_calculate_mmx_pmaddwd,
+ mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
+ ));
+      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo)) );
+ putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
+ goto decode_success;
+ }
+
+ /* 66 0F EE = PMAXSW -- 16x8 signed max */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xEE) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pmaxsw", Iop_Max16Sx8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xDE) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pmaxub", Iop_Max8Ux16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F EA = PMINSW -- 16x8 signed min */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xEA) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pminsw", Iop_Min16Sx8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DA = PMINUB -- 8x16 unsigned min */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xDA) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pminub", Iop_Min8Ux16, False );
+ goto decode_success;
+ }
+
+   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes
+      in xmm(E), turn them into a 16-bit value, and put the
+      zero-extend of it in ireg(G).  Doing this directly is just too
+      cumbersome; give up therefore and call a helper. */
+   /* ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0xD7) {
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ t0 = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I64);
+ assign(t0, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0));
+ assign(t1, getXMMRegLane64(eregOfRexRM(pfx,modrm), 1));
+ t5 = newTemp(Ity_I64);
+ assign(t5, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "amd64g_calculate_sse_pmovmskb",
+ &amd64g_calculate_sse_pmovmskb,
+ mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
+ putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t5)));
+ DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameIReg32(gregOfRexRM(pfx,modrm)));
+ delta += 3;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xE4) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pmulhuw", Iop_MulHi16Ux8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xE5) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pmulhw", Iop_MulHi16Sx8, False );
+ goto decode_success;
+ }
+
+   /* 66 0F D5 = PMULLW -- 16x8 multiply */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xD5) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "pmullw", Iop_Mul16x8, False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
+   /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-bit lanes
+      0 x 0 to form a 64-bit result */
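+   /* e.g. MullU32(0xFFFFFFFF, 2) gives 0x1FFFFFFFE -- the full 64-bit
+      product is retained. */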
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xF4) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I32);
+ t0 = newTemp(Ity_I32);
+ modrm = insn[2];
+
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 2+1;
+ DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("pmuludq %s,%s\n", dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ assign( t0, unop(Iop_64to32, mkexpr(dV)) );
+ assign( t1, unop(Iop_64to32, mkexpr(sV)) );
+ putMMXReg( gregLO3ofRM(modrm),
+ binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
+ goto decode_success;
+ }
+
+   /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-bit lanes
+      0 x 0 to form the lower 64-bit half, and of lanes 2 x 2 to form
+      the upper 64-bit half */
+ /* This is a really poor translation -- could be improved if
+ performance critical */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xF4) {
+ IRTemp sV, dV;
+ IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
+ t1 = newTemp(Ity_I64);
+ t0 = newTemp(Ity_I64);
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("pmuludq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ breakup128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+
+ assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
+ putXMMRegLane64( gregOfRexRM(pfx,modrm), 0, mkexpr(t0) );
+ assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
+ putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkexpr(t1) );
+ goto decode_success;
+ }
+
+ /* 66 0F EB = POR */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xEB) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "por", Iop_OrV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
+ from E(xmm or mem) to G(xmm) */
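+   /* Each 64-bit half is computed independently: the sum of the eight
+      absolute byte differences lands in its low 16 bits, zero-extended
+      to 64; e.g. if corresponding bytes differ by 1 everywhere, each
+      half holds 8. */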
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xF6) {
+ IRTemp s1V = newTemp(Ity_V128);
+ IRTemp s2V = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp s1Hi = newTemp(Ity_I64);
+ IRTemp s1Lo = newTemp(Ity_I64);
+ IRTemp s2Hi = newTemp(Ity_I64);
+ IRTemp s2Lo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( s1V, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 2+1;
+ DIP("psadbw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("psadbw %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+ assign( s2V, getXMMReg(gregOfRexRM(pfx,modrm)) );
+ assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
+ assign( s1Lo, unop(Iop_V128to64, mkexpr(s1V)) );
+ assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
+ assign( s2Lo, unop(Iop_V128to64, mkexpr(s2V)) );
+ assign( dHi, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "amd64g_calculate_mmx_psadbw",
+ &amd64g_calculate_mmx_psadbw,
+ mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
+ ));
+ assign( dLo, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "amd64g_calculate_mmx_psadbw",
+ &amd64g_calculate_mmx_psadbw,
+ mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
+ ));
+      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo)) );
+ putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
+ goto decode_success;
+ }
+
+ /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
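+   /* Lane i of the result is source lane (order >> (2*i)) & 3; e.g.
+      order = 0x1B (binary 00 01 10 11) reverses the four lanes. */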
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x70) {
+ Int order;
+ IRTemp sV, dV, s3, s2, s1, s0;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ order = (Int)insn[3];
+ delta += 3+1;
+ DIP("pshufd $%d,%s,%s\n", order,
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
+ 1/*byte after the amode*/ );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ order = (Int)insn[2+alen];
+ delta += 2+alen+1;
+ DIP("pshufd $%d,%s,%s\n", order,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+
+# define SEL(n) \
+ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
+ assign(dV,
+ mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
+ SEL((order>>2)&3), SEL((order>>0)&3) )
+ );
+ putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
+# undef SEL
+ goto decode_success;
+ }
+
+ /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
+ mem) to G(xmm), and copy lower half */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x70) {
+ Int order;
+ IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ sVhi = newTemp(Ity_I64);
+ dVhi = newTemp(Ity_I64);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ order = (Int)insn[3];
+ delta += 3+1;
+ DIP("pshufhw $%d,%s,%s\n", order,
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
+ 1/*byte after the amode*/ );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ order = (Int)insn[2+alen];
+ delta += 2+alen+1;
+ DIP("pshufhw $%d,%s,%s\n", order,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+ assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
+
+# define SEL(n) \
+ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
+ assign(dVhi,
+ mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
+ SEL((order>>2)&3), SEL((order>>0)&3) )
+ );
+ assign(dV, binop( Iop_64HLtoV128,
+ mkexpr(dVhi),
+ unop(Iop_V128to64, mkexpr(sV))) );
+ putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
+# undef SEL
+ goto decode_success;
+ }
+
+ /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
+ mem) to G(xmm), and copy upper half */
+ if (haveF2no66noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x70) {
+ Int order;
+ IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ sVlo = newTemp(Ity_I64);
+ dVlo = newTemp(Ity_I64);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ order = (Int)insn[3];
+ delta += 3+1;
+ DIP("pshuflw $%d,%s,%s\n", order,
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
+ 1/*byte after the amode*/ );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ order = (Int)insn[2+alen];
+ delta += 2+alen+1;
+ DIP("pshuflw $%d,%s,%s\n", order,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+ assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
+ breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
+
+# define SEL(n) \
+ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
+ assign(dVlo,
+ mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
+ SEL((order>>2)&3), SEL((order>>0)&3) )
+ );
+ assign(dV, binop( Iop_64HLtoV128,
+ unop(Iop_V128HIto64, mkexpr(sV)),
+ mkexpr(dVlo) ) );
+ putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
+# undef SEL
+ goto decode_success;
+ }
+
+ /* 66 0F 72 /6 ib = PSLLD by immediate */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x72
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 6) {
+ delta = dis_SSE_shiftE_imm( pfx, delta+2, "pslld", Iop_ShlN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F F2 = PSLLD by E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xF2) {
+ delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "pslld", Iop_ShlN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F 73 /7 ib = PSLLDQ by immediate */
+ /* note, if mem case ever filled in, 1 byte after amode */
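+   /* Byte-shift example: for imm = 3 the result is lo64r = lo64 << 24
+      and hi64r = (hi64 << 24) | (lo64 >> 40); for imm = 12 it is
+      lo64r = 0 and hi64r = lo64 << 32. */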
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x73
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 7) {
+ IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
+ Int imm = (Int)insn[3];
+ Int reg = eregOfRexRM(pfx,insn[2]);
+ DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
+ vassert(imm >= 0 && imm <= 255);
+ delta += 4;
+
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ hi64 = newTemp(Ity_I64);
+ lo64 = newTemp(Ity_I64);
+ hi64r = newTemp(Ity_I64);
+ lo64r = newTemp(Ity_I64);
+
+ if (imm >= 16) {
+ putXMMReg(reg, mkV128(0x0000));
+ goto decode_success;
+ }
+
+ assign( sV, getXMMReg(reg) );
+ assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
+
+ if (imm == 0) {
+ assign( lo64r, mkexpr(lo64) );
+ assign( hi64r, mkexpr(hi64) );
+ }
+ else
+ if (imm == 8) {
+ assign( lo64r, mkU64(0) );
+ assign( hi64r, mkexpr(lo64) );
+ }
+ else
+ if (imm > 8) {
+ assign( lo64r, mkU64(0) );
+ assign( hi64r, binop( Iop_Shl64,
+ mkexpr(lo64),
+ mkU8( 8*(imm-8) ) ));
+ } else {
+ assign( lo64r, binop( Iop_Shl64,
+ mkexpr(lo64),
+ mkU8(8 * imm) ));
+ assign( hi64r,
+ binop( Iop_Or64,
+ binop(Iop_Shl64, mkexpr(hi64),
+ mkU8(8 * imm)),
+ binop(Iop_Shr64, mkexpr(lo64),
+ mkU8(8 * (8 - imm)) )
+ )
+ );
+ }
+ assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
+ putXMMReg(reg, mkexpr(dV));
+ goto decode_success;
+ }
+
+ /* 66 0F 73 /6 ib = PSLLQ by immediate */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x73
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 6) {
+ delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllq", Iop_ShlN64x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F F3 = PSLLQ by E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xF3) {
+ delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psllq", Iop_ShlN64x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 71 /6 ib = PSLLW by immediate */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x71
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 6) {
+ delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllw", Iop_ShlN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F F1 = PSLLW by E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xF1) {
+ delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psllw", Iop_ShlN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F 72 /4 ib = PSRAD by immediate */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x72
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 4) {
+ delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrad", Iop_SarN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F E2 = PSRAD by E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xE2) {
+ delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrad", Iop_SarN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F 71 /4 ib = PSRAW by immediate */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x71
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 4) {
+ delta = dis_SSE_shiftE_imm( pfx, delta+2, "psraw", Iop_SarN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F E1 = PSRAW by E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xE1) {
+ delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psraw", Iop_SarN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F 72 /2 ib = PSRLD by immediate */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x72
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 2) {
+ delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrld", Iop_ShrN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F D2 = PSRLD by E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xD2) {
+ delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrld", Iop_ShrN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F 73 /3 ib = PSRLDQ by immediate */
+ /* note, if mem case ever filled in, 1 byte after amode */
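+   /* Mirror image of PSLLDQ above; e.g. for imm = 3 the result is
+      hi64r = hi64 >> 24 and lo64r = (lo64 >> 24) | (hi64 << 40). */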
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x73
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 3) {
+ IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
+ Int imm = (Int)insn[3];
+ Int reg = eregOfRexRM(pfx,insn[2]);
+ DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
+ vassert(imm >= 0 && imm <= 255);
+ delta += 4;
+
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ hi64 = newTemp(Ity_I64);
+ lo64 = newTemp(Ity_I64);
+ hi64r = newTemp(Ity_I64);
+ lo64r = newTemp(Ity_I64);
+
+ if (imm >= 16) {
+ putXMMReg(reg, mkV128(0x0000));
+ goto decode_success;
+ }
+
+ assign( sV, getXMMReg(reg) );
+ assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
+
+ if (imm == 0) {
+ assign( lo64r, mkexpr(lo64) );
+ assign( hi64r, mkexpr(hi64) );
+ }
+ else
+ if (imm == 8) {
+ assign( hi64r, mkU64(0) );
+ assign( lo64r, mkexpr(hi64) );
+ }
+ else
+ if (imm > 8) {
+ assign( hi64r, mkU64(0) );
+ assign( lo64r, binop( Iop_Shr64,
+ mkexpr(hi64),
+ mkU8( 8*(imm-8) ) ));
+ } else {
+ assign( hi64r, binop( Iop_Shr64,
+ mkexpr(hi64),
+ mkU8(8 * imm) ));
+ assign( lo64r,
+ binop( Iop_Or64,
+ binop(Iop_Shr64, mkexpr(lo64),
+ mkU8(8 * imm)),
+ binop(Iop_Shl64, mkexpr(hi64),
+ mkU8(8 * (8 - imm)) )
+ )
+ );
+ }
+
+ assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
+ putXMMReg(reg, mkexpr(dV));
+ goto decode_success;
+ }
+
+ /* 66 0F 73 /2 ib = PSRLQ by immediate */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x73
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 2) {
+ delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrlq", Iop_ShrN64x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F D3 = PSRLQ by E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xD3) {
+ delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrlq", Iop_ShrN64x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 71 /2 ib = PSRLW by immediate */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x71
+ && epartIsReg(insn[2])
+ && gregLO3ofRM(insn[2]) == 2) {
+ delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrlw", Iop_ShrN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F D1 = PSRLW by E */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xD1) {
+ delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrlw", Iop_ShrN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F F8 = PSUBB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xF8) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "psubb", Iop_Sub8x16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F FA = PSUBD */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xFA) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "psubd", Iop_Sub32x4, False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
+ /* 0F FB = PSUBQ -- sub 64x1 */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xFB) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ vbi, pfx, delta+2, insn[1], "psubq", False );
+ goto decode_success;
+ }
+
+ /* 66 0F FB = PSUBQ */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xFB) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "psubq", Iop_Sub64x2, False );
+ goto decode_success;
+ }
+
+ /* 66 0F F9 = PSUBW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xF9) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "psubw", Iop_Sub16x8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F E8 = PSUBSB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xE8) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "psubsb", Iop_QSub8Sx16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F E9 = PSUBSW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xE9) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "psubsw", Iop_QSub16Sx8, False );
+ goto decode_success;
+ }
+
+   /* 66 0F D8 = PSUBUSB */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xD8) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "psubusb", Iop_QSub8Ux16, False );
+ goto decode_success;
+ }
+
+   /* 66 0F D9 = PSUBUSW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xD9) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "psubusw", Iop_QSub16Ux8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 68 = PUNPCKHBW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x68) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "punpckhbw",
+ Iop_InterleaveHI8x16, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 6A = PUNPCKHDQ */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x6A) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "punpckhdq",
+ Iop_InterleaveHI32x4, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 6D = PUNPCKHQDQ */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x6D) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "punpckhqdq",
+ Iop_InterleaveHI64x2, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 69 = PUNPCKHWD */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x69) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "punpckhwd",
+ Iop_InterleaveHI16x8, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 60 = PUNPCKLBW */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x60) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "punpcklbw",
+ Iop_InterleaveLO8x16, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 62 = PUNPCKLDQ */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x62) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "punpckldq",
+ Iop_InterleaveLO32x4, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 6C = PUNPCKLQDQ */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x6C) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "punpcklqdq",
+ Iop_InterleaveLO64x2, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 61 = PUNPCKLWD */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x61) {
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
+ "punpcklwd",
+ Iop_InterleaveLO16x8, True );
+ goto decode_success;
+ }
+
+ /* 66 0F EF = PXOR */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xEF) {
+ delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pxor", Iop_XorV128 );
+ goto decode_success;
+ }
+
+//.. //-- /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
+//.. //-- if (insn[0] == 0x0F && insn[1] == 0xAE
+//.. //-- && (!epartIsReg(insn[2]))
+//.. //-- && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
+//.. //-- Bool store = gregOfRM(insn[2]) == 0;
+//.. //-- vg_assert(sz == 4);
+//.. //-- pair = disAMode ( cb, sorb, eip+2, dis_buf );
+//.. //-- t1 = LOW24(pair);
+//.. //-- eip += 2+HI8(pair);
+//.. //-- uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
+//.. //-- Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
+//.. //-- Lit16, (UShort)insn[2],
+//.. //-- TempReg, t1 );
+//.. //-- DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
+//.. //-- goto decode_success;
+//.. //-- }
+
+ /* 0F AE /7 = CLFLUSH -- flush cache line */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xAE
+ && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 7) {
+
+ /* This is something of a hack. We need to know the size of the
+ cache line containing addr. Since we don't (easily), assume
+ 256 on the basis that no real cache would have a line that
+ big. It's safe to invalidate more stuff than we need, just
+ inefficient. */
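+ /* For example, with lineszB == 256, an addr of 0x12345 is
+ rounded down to 0x12300 by the And64 with ~0xFF below, and
+ TILEN is set to the full 256 bytes.  (Illustrative only.) */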
+ ULong lineszB = 256ULL;
+
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ delta += 2+alen;
+
+ /* Round addr down to the start of the containing block. */
+ stmt( IRStmt_Put(
+ OFFB_TISTART,
+ binop( Iop_And64,
+ mkexpr(addr),
+ mkU64( ~(lineszB-1) ))) );
+
+ stmt( IRStmt_Put(OFFB_TILEN, mkU64(lineszB) ) );
+
+ irsb->jumpkind = Ijk_TInval;
+ irsb->next = mkU64(guest_RIP_bbstart+delta);
+ dres.whatNext = Dis_StopHere;
+
+ DIP("clflush %s\n", dis_buf);
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSE/SSE2 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* ---------------------------------------------------- */
+ /* --- start of the SSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
+ duplicating some lanes (2:2:0:0). */
+ /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
+ duplicating some lanes (3:3:1:1). */
+ if (haveF3no66noF2(pfx) && sz == 4
+ && insn[0] == 0x0F && (insn[1] == 0x12 || insn[1] == 0x16)) {
+ IRTemp s3, s2, s1, s0;
+ IRTemp sV = newTemp(Ity_V128);
+ Bool isH = insn[1] == 0x16;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+ DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ isH ? mk128from32s( s3, s3, s1, s1 )
+ : mk128from32s( s2, s2, s0, s0 ) );
+ goto decode_success;
+ }
+
+ /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
+ duplicating some lanes (0:1:0:1). */
+ if (haveF2no66noF3(pfx)
+ && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x12) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp d0 = newTemp(Ity_I64);
+
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+ DIP("movddup %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movddup %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
+ goto decode_success;
+ }
+
+ /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
+ if (haveF2no66noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xD0) {
+ IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
+ IRTemp eV = newTemp(Ity_V128);
+ IRTemp gV = newTemp(Ity_V128);
+ IRTemp addV = newTemp(Ity_V128);
+ IRTemp subV = newTemp(Ity_V128);
+ a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
+
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+ DIP("addsubps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("addsubps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+
+ assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
+ assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
+
+ breakup128to32s( addV, &a3, &a2, &a1, &a0 );
+ breakup128to32s( subV, &s3, &s2, &s1, &s0 );
+
+ putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( a3, s2, a1, s0 ));
+ goto decode_success;
+ }
+
+ /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0xD0) {
+ IRTemp eV = newTemp(Ity_V128);
+ IRTemp gV = newTemp(Ity_V128);
+ IRTemp addV = newTemp(Ity_V128);
+ IRTemp subV = newTemp(Ity_V128);
+ IRTemp a1 = newTemp(Ity_I64);
+ IRTemp s0 = newTemp(Ity_I64);
+
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+ DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("addsubpd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+
+ assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
+ assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
+
+ assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
+ assign( s0, unop(Iop_V128to64, mkexpr(subV) ));
+
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
+ goto decode_success;
+ }
+
+ /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
+ /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
+ if (haveF2no66noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
+ IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
+ IRTemp eV = newTemp(Ity_V128);
+ IRTemp gV = newTemp(Ity_V128);
+ IRTemp leftV = newTemp(Ity_V128);
+ IRTemp rightV = newTemp(Ity_V128);
+ Bool isAdd = insn[1] == 0x7C;
+ HChar* str = isAdd ? "add" : "sub";
+ e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
+
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+ DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("h%sps %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+
+ assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ breakup128to32s( eV, &e3, &e2, &e1, &e0 );
+ breakup128to32s( gV, &g3, &g2, &g1, &g0 );
+
+ assign( leftV, mk128from32s( e2, e0, g2, g0 ) );
+ assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
+
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
+ mkexpr(leftV), mkexpr(rightV) ) );
+ goto decode_success;
+ }
+
+ /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
+ /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
+ if (have66noF2noF3(pfx) && sz == 2
+ && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
+ IRTemp e1 = newTemp(Ity_I64);
+ IRTemp e0 = newTemp(Ity_I64);
+ IRTemp g1 = newTemp(Ity_I64);
+ IRTemp g0 = newTemp(Ity_I64);
+ IRTemp eV = newTemp(Ity_V128);
+ IRTemp gV = newTemp(Ity_V128);
+ IRTemp leftV = newTemp(Ity_V128);
+ IRTemp rightV = newTemp(Ity_V128);
+ Bool isAdd = insn[1] == 0x7C;
+ HChar* str = isAdd ? "add" : "sub";
+
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+ DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("h%spd %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+
+ assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
+ assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
+ assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
+ assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
+
+ assign( leftV, binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
+ assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
+
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
+ mkexpr(leftV), mkexpr(rightV) ) );
+ goto decode_success;
+ }
+
+ /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
+ if (haveF2no66noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0xF0) {
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ goto decode_failure;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
+ putXMMReg( gregOfRexRM(pfx,modrm),
+ loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("lddqu %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* ---------------------------------------------------- */
+ /* --- start of the SSSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
+ Unsigned Bytes (MMX) */
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ IRTemp sVoddsSX = newTemp(Ity_I64);
+ IRTemp sVevensSX = newTemp(Ity_I64);
+ IRTemp dVoddsZX = newTemp(Ity_I64);
+ IRTemp dVevensZX = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmaddubsw %s,%s\n", dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ /* compute dV unsigned x sV signed */
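+ /* The widening trick: SarN16x4(sV,8) arithmetically shifts each
+ 16-bit lane right by 8, leaving the odd-numbered source bytes
+ sign-extended to 16 bits; the Shl-then-Sar pair does the same
+ for the even bytes.  The logical ShrN16x4 variants give the
+ zero-extended d bytes. */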
+ assign( sVoddsSX,
+ binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
+ assign( sVevensSX,
+ binop(Iop_SarN16x4,
+ binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
+ mkU8(8)) );
+ assign( dVoddsZX,
+ binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
+ assign( dVevensZX,
+ binop(Iop_ShrN16x4,
+ binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
+ mkU8(8)) );
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ binop(Iop_QAdd16Sx4,
+ binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
+ binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
+ Unsigned Bytes (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sVoddsSX = newTemp(Ity_V128);
+ IRTemp sVevensSX = newTemp(Ity_V128);
+ IRTemp dVoddsZX = newTemp(Ity_V128);
+ IRTemp dVevensZX = newTemp(Ity_V128);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmaddubsw %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ /* compute dV unsigned x sV signed */
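+ /* Same even/odd widening trick as the MMX case above, applied
+ to 16x8 lanes. */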
+ assign( sVoddsSX,
+ binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
+ assign( sVevensSX,
+ binop(Iop_SarN16x8,
+ binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
+ mkU8(8)) );
+ assign( dVoddsZX,
+ binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
+ assign( dVevensZX,
+ binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
+ mkU8(8)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_QAdd16Sx8,
+ binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
+ binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
+ /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
+ mmx) and G to G (mmx). */
+ /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
+ mmx) and G to G (mmx). */
+ /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
+ to G (mmx). */
+ /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
+ to G (mmx). */
+ /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
+ to G (mmx). */
+ /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
+ to G (mmx). */
+
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
+ || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
+ HChar* str = "???";
+ IROp opV64 = Iop_INVALID;
+ IROp opCatO = Iop_CatOddLanes16x4;
+ IROp opCatE = Iop_CatEvenLanes16x4;
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+
+ modrm = insn[3];
+
+ switch (insn[2]) {
+ case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
+ case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
+ case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
+ case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
+ case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
+ case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
+ default: vassert(0);
+ }
+ if (insn[2] == 0x02 || insn[2] == 0x06) {
+ opCatO = Iop_InterleaveHI32x2;
+ opCatE = Iop_InterleaveLO32x2;
+ }
+
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("ph%s %s,%s\n", str, dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ binop(opV64,
+ binop(opCatE,mkexpr(sV),mkexpr(dV)),
+ binop(opCatO,mkexpr(sV),mkexpr(dV))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
+ xmm) and G to G (xmm). */
+ /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
+ xmm) and G to G (xmm). */
+ /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
+ G to G (xmm). */
+ /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
+ G to G (xmm). */
+ /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
+ G to G (xmm). */
+ /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
+ G to G (xmm). */
+
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
+ || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
+ HChar* str = "???";
+ IROp opV64 = Iop_INVALID;
+ IROp opCatO = Iop_CatOddLanes16x4;
+ IROp opCatE = Iop_CatEvenLanes16x4;
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+
+ switch (insn[2]) {
+ case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
+ case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
+ case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
+ case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
+ case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
+ case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
+ default: vassert(0);
+ }
+ if (insn[2] == 0x02 || insn[2] == 0x06) {
+ opCatO = Iop_InterleaveHI32x2;
+ opCatE = Iop_InterleaveLO32x2;
+ }
+
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
+ DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("ph%s %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ delta += 3+alen;
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ /* This isn't a particularly efficient way to compute the
+ result, but at least it avoids a proliferation of IROps,
+ hence avoids complicating all the backends. */
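+ /* Concretely, for phaddw: CatEvenLanes16x4(sHi,sLo) gathers s
+ lanes 0,2,4,6 and CatOddLanes16x4 gathers lanes 1,3,5,7, so
+ opV64 on the pair gives (s0+s1, s2+s3, s4+s5, s6+s7), lowest
+ lane first, for the high half of the result; likewise for d in
+ the low half.  (Derived from the CatLanes semantics;
+ illustrative.) */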
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128,
+ binop(opV64,
+ binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
+ binop(opCatO,mkexpr(sHi),mkexpr(sLo))
+ ),
+ binop(opV64,
+ binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
+ binop(opCatO,mkexpr(dHi),mkexpr(dLo))
+ )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
+ (MMX) */
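+ /* Per the Intel docs, each 16-bit lane computes
+ (((x*y) >> 14) + 1) >> 1, i.e. a rounded high-half multiply;
+ the lane arithmetic lives in dis_PMULHRSW_helper. */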
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmulhrsw %s,%s\n", dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
+ Scale (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmulhrsw %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128,
+ dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
+ dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 08 = PSIGNB -- Packed Sign 8x8 (MMX) */
+ /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
+ /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
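+ /* For each lane, dis_PSIGN_helper produces -d if the s lane is
+ negative, 0 if it is zero, and d unchanged otherwise. */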
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x08: laneszB = 1; str = "b"; break;
+ case 0x09: laneszB = 2; str = "w"; break;
+ case 0x0A: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("psign%s %s,%s\n", str, dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
+ /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
+ /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x08: laneszB = 1; str = "b"; break;
+ case 0x09: laneszB = 2; str = "w"; break;
+ case 0x0A: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("psign%s %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128,
+ dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
+ dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8 (MMX) */
+ /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
+ /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+ IRTemp sV = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x1C: laneszB = 1; str = "b"; break;
+ case 0x1D: laneszB = 2; str = "w"; break;
+ case 0x1E: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+ do_MMX_preamble();
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pabs%s %s,%s\n", str, dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ dis_PABS_helper( mkexpr(sV), laneszB )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
+ /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
+ /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x1C: laneszB = 1; str = "b"; break;
+ case 0x1D: laneszB = 2; str = "w"; break;
+ case 0x1E: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pabs%s %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128,
+ dis_PABS_helper( mkexpr(sHi), laneszB ),
+ dis_PABS_helper( mkexpr(sLo), laneszB )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
+ if (haveNo66noF2noF3(pfx) && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ IRTemp res = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ d64 = (Long)insn[3+1];
+ delta += 3+1+1;
+ DIP("palignr $%d,%s,%s\n", (Int)d64,
+ nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ d64 = (Long)insn[3+alen];
+ delta += 3+alen+1;
+ DIP("palignr $%d%s,%s\n", (Int)d64,
+ dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ if (d64 == 0) {
+ assign( res, mkexpr(sV) );
+ }
+ else if (d64 >= 1 && d64 <= 7) {
+ assign(res,
+ binop(Iop_Or64,
+ binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
+ binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
+ )));
+ }
+ else if (d64 == 8) {
+ assign( res, mkexpr(dV) );
+ }
+ else if (d64 >= 9 && d64 <= 15) {
+ assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
+ }
+ else if (d64 >= 16 && d64 <= 255) {
+ assign( res, mkU64(0) );
+ }
+ else
+ vassert(0);
+
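+ /* Illustrative case: d64 == 3 gives res = (sV >> 24) | (dV << 40),
+ so result bytes 0..4 are sV bytes 3..7 and bytes 5..7 are dV
+ bytes 0..2. */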
+ putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
+ goto decode_success;
+ }
+
+ /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ IRTemp rHi = newTemp(Ity_I64);
+ IRTemp rLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ d64 = (Long)insn[3+1];
+ delta += 3+1+1;
+ DIP("palignr $%d,%s,%s\n", (Int)d64,
+ nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ d64 = (Long)insn[3+alen];
+ delta += 3+alen+1;
+ DIP("palignr $%d,%s,%s\n", (Int)d64,
+ dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ if (d64 == 0) {
+ assign( rHi, mkexpr(sHi) );
+ assign( rLo, mkexpr(sLo) );
+ }
+ else if (d64 >= 1 && d64 <= 7) {
+ assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d64) );
+ assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d64) );
+ }
+ else if (d64 == 8) {
+ assign( rHi, mkexpr(dLo) );
+ assign( rLo, mkexpr(sHi) );
+ }
+ else if (d64 >= 9 && d64 <= 15) {
+ assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d64-8) );
+ assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d64-8) );
+ }
+ else if (d64 == 16) {
+ assign( rHi, mkexpr(dHi) );
+ assign( rLo, mkexpr(dLo) );
+ }
+ else if (d64 >= 17 && d64 <= 23) {
+ assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-16))) );
+ assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d64-16) );
+ }
+ else if (d64 == 24) {
+ assign( rHi, mkU64(0) );
+ assign( rLo, mkexpr(dHi) );
+ }
+ else if (d64 >= 25 && d64 <= 31) {
+ assign( rHi, mkU64(0) );
+ assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-24))) );
+ }
+ else if (d64 >= 32 && d64 <= 255) {
+ assign( rHi, mkU64(0) );
+ assign( rLo, mkU64(0) );
+ }
+ else
+ vassert(0);
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
+ if (haveNo66noF2noF3(pfx)
+ && sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
+ delta += 3+1;
+ DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
+ nameMMXReg(gregLO3ofRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pshufb %s,%s\n", dis_buf,
+ nameMMXReg(gregLO3ofRM(modrm)));
+ }
+
+ putMMXReg(
+ gregLO3ofRM(modrm),
+ binop(
+ Iop_And64,
+ /* permute the lanes */
+ binop(
+ Iop_Perm8x8,
+ mkexpr(dV),
+ binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
+ ),
+ /* mask off lanes which have (index & 0x80) == 0x80 */
+ unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
+ )
+ );
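+ /* Example: a control byte of 0x83 in sV has bit 7 set, so the
+ corresponding result byte is forced to zero; a control byte of
+ 0x03 selects byte 3 of dV. */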
+ goto decode_success;
+ }
+
+ /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
+ if (have66noF2noF3(pfx)
+ && (sz == 2 || /*redundant REX.W*/ sz == 8)
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ IRTemp rHi = newTemp(Ity_I64);
+ IRTemp rLo = newTemp(Ity_I64);
+ IRTemp sevens = newTemp(Ity_I64);
+ IRTemp mask0x80hi = newTemp(Ity_I64);
+ IRTemp mask0x80lo = newTemp(Ity_I64);
+ IRTemp maskBit3hi = newTemp(Ity_I64);
+ IRTemp maskBit3lo = newTemp(Ity_I64);
+ IRTemp sAnd7hi = newTemp(Ity_I64);
+ IRTemp sAnd7lo = newTemp(Ity_I64);
+ IRTemp permdHi = newTemp(Ity_I64);
+ IRTemp permdLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
+ delta += 3+1;
+ DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pshufb %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRexRM(pfx,modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ assign( sevens, mkU64(0x0707070707070707ULL) );
+
+ /*
+ mask0x80hi = Not(SarN8x8(sHi,7))
+ maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
+ sAnd7hi = And(sHi,sevens)
+ permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
+ And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
+ rHi = And(permdHi,mask0x80hi)
+ */
+ assign(
+ mask0x80hi,
+ unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
+
+ assign(
+ maskBit3hi,
+ binop(Iop_SarN8x8,
+ binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
+ mkU8(7)));
+
+ assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
+
+ assign(
+ permdHi,
+ binop(
+ Iop_Or64,
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
+ mkexpr(maskBit3hi)),
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
+ unop(Iop_Not64,mkexpr(maskBit3hi))) ));
+
+ assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
+
+ /* And the same for the lower half of the result. What fun. */
+
+ assign(
+ mask0x80lo,
+ unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
+
+ assign(
+ maskBit3lo,
+ binop(Iop_SarN8x8,
+ binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
+ mkU8(7)));
+
+ assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
+
+ assign(
+ permdLo,
+ binop(
+ Iop_Or64,
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
+ mkexpr(maskBit3lo)),
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
+ unop(Iop_Not64,mkexpr(maskBit3lo))) ));
+
+ assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
+
+ putXMMReg(
+ gregOfRexRM(pfx,modrm),
+ binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
+ );
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* ---------------------------------------------------- */
+ /* --- start of the SSE4 decoder --- */
+ /* ---------------------------------------------------- */
+
+ /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
+ Blend Packed Double Precision Floating-Point Values (XMM) */
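+ /* imm8 bit 0 selects the low double (src if set, else dst) and
+ bit 1 the high double; the remaining imm8 bits are ignored. */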
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0D ) {
+
+ Int imm8;
+ UShort imm8_mask_16;
+
+ IRTemp dst_vec = newTemp(Ity_V128);
+ IRTemp src_vec = newTemp(Ity_V128);
+ IRTemp imm8_mask = newTemp(Ity_V128);
+
+ modrm = insn[3];
+ assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8 = (Int)insn[4];
+ assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1+1;
+ DIP( "blendpd $%d, %s,%s\n", imm8,
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
+ 1/* imm8 is 1 byte after the amode */ );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
+ imm8 = (Int)insn[2+alen+1];
+ delta += 3+alen+1;
+ DIP( "blendpd $%d, %s,%s\n",
+ imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ switch( imm8 & 3 ) {
+ case 0: imm8_mask_16 = 0x0000; break;
+ case 1: imm8_mask_16 = 0x00FF; break;
+ case 2: imm8_mask_16 = 0xFF00; break;
+ case 3: imm8_mask_16 = 0xFFFF; break;
+ default: vassert(0); break;
+ }
+ assign( imm8_mask, mkV128( imm8_mask_16 ) );
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_OrV128,
+ binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm8_mask) ),
+ binop( Iop_AndV128, mkexpr(dst_vec),
+ unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
+ Blend Packed Single Precision Floating-Point Values (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0C ) {
+
+ Int imm8;
+ IRTemp dst_vec = newTemp(Ity_V128);
+ IRTemp src_vec = newTemp(Ity_V128);
+
+ modrm = insn[3];
+
+ assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8 = (Int)insn[3+1];
+ assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1+1;
+ DIP( "blendps $%d, %s,%s\n", imm8,
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
+ 1/* imm8 is 1 byte after the amode */ );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
+ imm8 = (Int)insn[3+alen];
+ delta += 3+alen+1;
+ DIP( "blendpd $%d, %s,%s\n",
+ imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00, 0x0F0F,
+ 0x0FF0, 0x0FFF, 0xF000, 0xF00F, 0xF0F0, 0xF0FF,
+ 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF };
+ IRTemp imm8_mask = newTemp(Ity_V128);
+ assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_OrV128,
+ binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm8_mask) ),
+ binop( Iop_AndV128, mkexpr(dst_vec),
+ unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
+ * Carry-less multiplication of selected XMM quadwords into XMM
+ * registers (a.k.a. multiplication of polynomials over GF(2))
+ */
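+ /* Carry-less means partial products are XORed rather than added:
+ e.g. 0b11 (x) 0b11 = 0b11 ^ 0b110 = 0b101, whereas the ordinary
+ product 3*3 is 0b1001. */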
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x44 ) {
+
+ Int imm8;
+ IRTemp svec = newTemp(Ity_V128);
+ IRTemp dvec = newTemp(Ity_V128);
+
+ modrm = insn[3];
+
+ assign( dvec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8 = (Int)insn[4];
+ assign( svec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1+1;
+ DIP( "pclmulqdq $%d, %s,%s\n", imm8,
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
+ 1/* imm8 is 1 byte after the amode */ );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
+ imm8 = (Int)insn[2+alen+1];
+ delta += 3+alen+1;
+ DIP( "pclmulqdq $%d, %s,%s\n",
+ imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ t0 = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I64);
+ assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64, mkexpr(dvec)));
+ assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64, mkexpr(svec)));
+
+ t2 = newTemp(Ity_I64);
+ t3 = newTemp(Ity_I64);
+
+ IRExpr** args;
+
+ args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
+ assign(t2,
+ mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
+ &amd64g_calculate_pclmul, args));
+ args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
+ assign(t3,
+ mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
+ &amd64g_calculate_pclmul, args));
+
+ IRTemp res = newTemp(Ity_V128);
+ assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
+ putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
+
+ goto decode_success;
+ }
+
+ /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
+ Dot Product of Packed Double Precision Floating-Point Values (XMM) */
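+ /* Per the Intel docs, imm8 bits 5:4 mask the two products before
+ the horizontal add, and imm8 bits 1:0 select which result lanes
+ receive the sum; the other lanes are zeroed. */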
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x41 ) {
+
+ Int imm8;
+ IRTemp src_vec = newTemp(Ity_V128);
+ IRTemp dst_vec = newTemp(Ity_V128);
+ IRTemp and_vec = newTemp(Ity_V128);
+ IRTemp sum_vec = newTemp(Ity_V128);
+
+ modrm = insn[3];
+
+ assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8 = (Int)insn[4];
+ assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1+1;
+ DIP( "dppd $%d, %s,%s\n", imm8,
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
+ 1/* imm8 is 1 byte after the amode */ );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
+ imm8 = (Int)insn[2+alen+1];
+ delta += 3+alen+1;
+ DIP( "dppd $%d, %s,%s\n",
+ imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
+
+ assign( and_vec, binop( Iop_AndV128,
+ binop( Iop_Mul64Fx2,
+ mkexpr(dst_vec), mkexpr(src_vec) ),
+ mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );
+
+ assign( sum_vec, binop( Iop_Add64F0x2,
+ binop( Iop_InterleaveHI64x2,
+ mkexpr(and_vec), mkexpr(and_vec) ),
+ binop( Iop_InterleaveLO64x2,
+ mkexpr(and_vec), mkexpr(and_vec) ) ) );
+
+ putXMMReg( gregOfRexRM( pfx, modrm ),
+ binop( Iop_AndV128,
+ binop( Iop_InterleaveLO64x2,
+ mkexpr(sum_vec), mkexpr(sum_vec) ),
+ mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
+ Dot Product of Packed Single Precision Floating-Point Values (XMM) */
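+ /* As for DPPD but with four lanes: imm8 bits 7:4 mask the
+ products and imm8 bits 3:0 select the result lanes written. */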
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F
+ && insn[1] == 0x3A
+ && insn[2] == 0x40 ) {
+
+ Int imm8;
+ IRTemp xmm1_vec = newTemp(Ity_V128);
+ IRTemp xmm2_vec = newTemp(Ity_V128);
+ IRTemp tmp_prod_vec = newTemp(Ity_V128);
+ IRTemp prod_vec = newTemp(Ity_V128);
+ IRTemp sum_vec = newTemp(Ity_V128);
+ IRTemp v3, v2, v1, v0;
+ v3 = v2 = v1 = v0 = IRTemp_INVALID;
+
+ modrm = insn[3];
+
+ assign( xmm1_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8 = (Int)insn[4];
+ assign( xmm2_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1+1;
+ DIP( "dpps $%d, %s,%s\n", imm8,
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
+ 1/* imm8 is 1 byte after the amode */ );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( xmm2_vec, loadLE( Ity_V128, mkexpr(addr) ) );
+ imm8 = (Int)insn[2+alen+1];
+ delta += 3+alen+1;
+ DIP( "dpps $%d, %s,%s\n",
+ imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
+ 0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
+ 0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF };
+
+ assign( tmp_prod_vec,
+ binop( Iop_AndV128,
+ binop( Iop_Mul32Fx4, mkexpr(xmm1_vec), mkexpr(xmm2_vec) ),
+ mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
+ breakup128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
+ assign( prod_vec, mk128from32s( v3, v1, v2, v0 ) );
+
+ assign( sum_vec, binop( Iop_Add32Fx4,
+ binop( Iop_InterleaveHI32x4,
+ mkexpr(prod_vec), mkexpr(prod_vec) ),
+ binop( Iop_InterleaveLO32x4,
+ mkexpr(prod_vec), mkexpr(prod_vec) ) ) );
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_AndV128,
+ binop( Iop_Add32Fx4,
+ binop( Iop_InterleaveHI32x4,
+ mkexpr(sum_vec), mkexpr(sum_vec) ),
+ binop( Iop_InterleaveLO32x4,
+ mkexpr(sum_vec), mkexpr(sum_vec) ) ),
+ mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0F 3A 21 /r ib = INSERTPS xmm1, xmm2/m32, imm8
+ Insert Packed Single Precision Floating-Point Value (XMM) */
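+ /* imm8 layout (per the Intel docs): bits 7:6 select the source
+ lane (register form only), bits 5:4 select the destination
+ lane, and bits 3:0 form a zero-mask applied to the result. */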
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x21 ) {
+
+ Int imm8;
+ Int imm8_count_s;
+ Int imm8_count_d;
+ Int imm8_zmask;
+ IRTemp dstVec = newTemp(Ity_V128);
+ IRTemp srcDWord = newTemp(Ity_I32);
+
+ modrm = insn[3];
+
+ assign( dstVec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
+
+ if ( epartIsReg( modrm ) ) {
+ IRTemp src_vec = newTemp(Ity_V128);
+ assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+
+ IRTemp src_lane_0 = IRTemp_INVALID;
+ IRTemp src_lane_1 = IRTemp_INVALID;
+ IRTemp src_lane_2 = IRTemp_INVALID;
+ IRTemp src_lane_3 = IRTemp_INVALID;
+ breakup128to32s( src_vec,
+ &src_lane_3, &src_lane_2, &src_lane_1, &src_lane_0 );
+
+ imm8 = (Int)insn[4];
+ imm8_count_s = ((imm8 >> 6) & 3);
+ switch( imm8_count_s ) {
+ case 0: assign( srcDWord, mkexpr(src_lane_0) ); break;
+ case 1: assign( srcDWord, mkexpr(src_lane_1) ); break;
+ case 2: assign( srcDWord, mkexpr(src_lane_2) ); break;
+ case 3: assign( srcDWord, mkexpr(src_lane_3) ); break;
+ default: vassert(0); break;
+ }
+
+ delta += 3+1+1;
+ DIP( "insertps $%d, %s,%s\n", imm8,
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
+ 1/* const imm8 is 1 byte after the amode */ );
+ assign( srcDWord, loadLE( Ity_I32, mkexpr(addr) ) );
+ imm8 = (Int)insn[2+alen+1];
+ imm8_count_s = 0;
+ delta += 3+alen+1;
+ DIP( "insertps $%d, %s,%s\n",
+ imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ IRTemp dst_lane_0 = IRTemp_INVALID;
+ IRTemp dst_lane_1 = IRTemp_INVALID;
+ IRTemp dst_lane_2 = IRTemp_INVALID;
+ IRTemp dst_lane_3 = IRTemp_INVALID;
+ breakup128to32s( dstVec,
+ &dst_lane_3, &dst_lane_2, &dst_lane_1, &dst_lane_0 );
+
+ imm8_count_d = ((imm8 >> 4) & 3);
+ switch( imm8_count_d ) {
+ case 0: dst_lane_0 = srcDWord; break;
+ case 1: dst_lane_1 = srcDWord; break;
+ case 2: dst_lane_2 = srcDWord; break;
+ case 3: dst_lane_3 = srcDWord; break;
+ default: vassert(0); break;
+ }
+
+ imm8_zmask = (imm8 & 15);
+ IRTemp zero_32 = newTemp(Ity_I32);
+ assign( zero_32, mkU32(0) );
+
+ IRExpr* ire_vec_128 = mk128from32s(
+ ((imm8_zmask & 8) == 8) ? zero_32 : dst_lane_3,
+ ((imm8_zmask & 4) == 4) ? zero_32 : dst_lane_2,
+ ((imm8_zmask & 2) == 2) ? zero_32 : dst_lane_1,
+ ((imm8_zmask & 1) == 1) ? zero_32 : dst_lane_0 );
+
+ putXMMReg( gregOfRexRM(pfx, modrm), ire_vec_128 );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0F 3A 14 /r ib = PEXTRB reg/m8, xmm, imm8
+ Extract Byte from xmm, store in mem or zero-extend + store in gen.reg. (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x14 ) {
+
+ Int imm8;
+ IRTemp xmm_vec = newTemp(Ity_V128);
+ IRTemp sel_lane = newTemp(Ity_I32);
+ IRTemp shr_lane = newTemp(Ity_I32);
+
+ modrm = insn[3];
+ assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
+ breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8 = (Int)insn[3+1];
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ imm8 = (Int)insn[3+alen];
+ }
+ switch( (imm8 >> 2) & 3 ) {
+ case 0: assign( sel_lane, mkexpr(t0) ); break;
+ case 1: assign( sel_lane, mkexpr(t1) ); break;
+ case 2: assign( sel_lane, mkexpr(t2) ); break;
+ case 3: assign( sel_lane, mkexpr(t3) ); break;
+ default: vassert(0);
+ }
+ assign( shr_lane,
+ binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
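+ /* e.g. imm8 == 6: (6 >> 2) & 3 == 1 picks dword t1, and
+ (6 & 3) * 8 == 16 shifts source byte 6 into the low 8 bits. */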
+
+ if ( epartIsReg( modrm ) ) {
+ putIReg64( eregOfRexRM(pfx,modrm),
+ unop( Iop_32Uto64,
+ binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
+
+ delta += 3+1+1;
+ DIP( "pextrb $%d, %s,%s\n", imm8,
+ nameXMMReg( gregOfRexRM(pfx, modrm) ),
+ nameIReg64( eregOfRexRM(pfx, modrm) ) );
+ } else {
+ storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
+ delta += 3+alen+1;
+ DIP( "$%d, pextrb %s,%s\n",
+ imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
+ }
+
+ goto decode_success;
+ }
+
+
+ /* 66 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
+ Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
+ Note that this insn has the same opcodes as PEXTRQ, but
+ here the REX.W bit is _not_ present */
+ if ( have66noF2noF3( pfx )
+ && sz == 2 /* REX.W is _not_ present */
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x16 ) {
+
+ Int imm8_10;
+ IRTemp xmm_vec = newTemp(Ity_V128);
+ IRTemp src_dword = newTemp(Ity_I32);
+
+ modrm = insn[3];
+ assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
+ breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8_10 = (Int)(insn[3+1] & 3);
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ imm8_10 = (Int)(insn[3+alen] & 3);
+ }
+
+ switch ( imm8_10 ) {
+ case 0: assign( src_dword, mkexpr(t0) ); break;
+ case 1: assign( src_dword, mkexpr(t1) ); break;
+ case 2: assign( src_dword, mkexpr(t2) ); break;
+ case 3: assign( src_dword, mkexpr(t3) ); break;
+ default: vassert(0);
+ }
+
+ if ( epartIsReg( modrm ) ) {
+ putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
+ delta += 3+1+1;
+ DIP( "pextrd $%d, %s,%s\n", imm8_10,
+ nameXMMReg( gregOfRexRM(pfx, modrm) ),
+ nameIReg32( eregOfRexRM(pfx, modrm) ) );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(src_dword) );
+ delta += 3+alen+1;
+ DIP( "pextrd $%d, %s,%s\n",
+ imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
+ }
+
+ goto decode_success;
+ }
+
+
+ /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
+ Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
+ Note that this insn has the same opcodes as PEXTRD, but
+ here the REX.W bit is present */
+ if ( have66noF2noF3( pfx )
+ && sz == 8 /* REX.W is present */
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x16 ) {
+
+ Int imm8_0;
+ IRTemp xmm_vec = newTemp(Ity_V128);
+ IRTemp src_qword = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8_0 = (Int)(insn[3+1] & 1);
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ imm8_0 = (Int)(insn[3+alen] & 1);
+ }
+ switch ( imm8_0 ) {
+ case 0: assign( src_qword, unop(Iop_V128to64, mkexpr(xmm_vec)) ); break;
+ case 1: assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) ); break;
+ default: vassert(0);
+ }
+
+ if ( epartIsReg( modrm ) ) {
+ putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
+ delta += 3+1+1;
+ DIP( "pextrq $%d, %s,%s\n", imm8_0,
+ nameXMMReg( gregOfRexRM(pfx, modrm) ),
+ nameIReg64( eregOfRexRM(pfx, modrm) ) );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(src_qword) );
+ delta += 3+alen+1;
+ DIP( "pextrq $%d, %s,%s\n",
+ imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
+ }
+
+ goto decode_success;
+ }
+
+
+ /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
+ Extract Word from xmm, store in mem or zero-extend + store in gen.reg. (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x15 ) {
+
+ Int imm8_20;
+ IRTemp xmm_vec = newTemp(Ity_V128);
+ IRTemp src_word = newTemp(Ity_I16);
+
+ modrm = insn[3];
+ assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
+ breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
+
+ if ( epartIsReg( modrm ) ) {
+ imm8_20 = (Int)(insn[3+1] & 7);
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ imm8_20 = (Int)(insn[3+alen] & 7);
+ }
+
+ switch ( imm8_20 ) {
+ case 0: assign( src_word, unop(Iop_32to16, mkexpr(t0)) ); break;
+ case 1: assign( src_word, unop(Iop_32HIto16, mkexpr(t0)) ); break;
+ case 2: assign( src_word, unop(Iop_32to16, mkexpr(t1)) ); break;
+ case 3: assign( src_word, unop(Iop_32HIto16, mkexpr(t1)) ); break;
+ case 4: assign( src_word, unop(Iop_32to16, mkexpr(t2)) ); break;
+ case 5: assign( src_word, unop(Iop_32HIto16, mkexpr(t2)) ); break;
+ case 6: assign( src_word, unop(Iop_32to16, mkexpr(t3)) ); break;
+ case 7: assign( src_word, unop(Iop_32HIto16, mkexpr(t3)) ); break;
+ default: vassert(0);
+ }
+
+ if ( epartIsReg( modrm ) ) {
+ putIReg64( eregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(src_word)) );
+ delta += 3+1+1;
+ DIP( "pextrw $%d, %s,%s\n", imm8_20,
+ nameXMMReg( gregOfRexRM(pfx, modrm) ),
+ nameIReg64( eregOfRexRM(pfx, modrm) ) );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(src_word) );
+ delta += 3+alen+1;
+ DIP( "pextrw $%d, %s,%s\n",
+ imm8_20, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
+ }
+
+ goto decode_success;
+ }
+
+
+ /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
+ Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
+ if ( have66noF2noF3( pfx )
+ && sz == 8 /* REX.W is present */
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x22 ) {
+
+ Int imm8_0;
+ IRTemp src_elems = newTemp(Ity_I64);
+ IRTemp src_vec = newTemp(Ity_V128);
+
+ modrm = insn[3];
+
+ if ( epartIsReg( modrm ) ) {
+ imm8_0 = (Int)(insn[3+1] & 1);
+ assign( src_elems, getIReg64( eregOfRexRM(pfx,modrm) ) );
+ delta += 3+1+1;
+ DIP( "pinsrq $%d, %s,%s\n", imm8_0,
+ nameIReg64( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ imm8_0 = (Int)(insn[3+alen] & 1);
+ assign( src_elems, loadLE( Ity_I64, mkexpr(addr) ) );
+ delta += 3+alen+1;
+ DIP( "pinsrq $%d, %s,%s\n",
+ imm8_0, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ UShort mask = 0;
+ if ( imm8_0 == 0 ) {
+ mask = 0xFF00;
+ assign( src_vec, binop( Iop_64HLtoV128, mkU64(0), mkexpr(src_elems) ) );
+ } else {
+ mask = 0x00FF;
+ assign( src_vec, binop( Iop_64HLtoV128, mkexpr(src_elems), mkU64(0) ) );
+ }
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_OrV128, mkexpr(src_vec),
+ binop( Iop_AndV128,
+ getXMMReg( gregOfRexRM(pfx, modrm) ),
+ mkV128(mask) ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
+ Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
+ if ( have66noF2noF3( pfx )
+ && sz == 2 /* REX.W is NOT present */
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x22 ) {
+
+ Int imm8_10;
+ IRTemp src_elems = newTemp(Ity_I32);
+ IRTemp src_vec = newTemp(Ity_V128);
+ IRTemp z32 = newTemp(Ity_I32);
+
+ modrm = insn[3];
+
+ if ( epartIsReg( modrm ) ) {
+ imm8_10 = (Int)(insn[3+1] & 3);
+ assign( src_elems, getIReg32( eregOfRexRM(pfx,modrm) ) );
+ delta += 3+1+1;
+ DIP( "pinsrd $%d, %s,%s\n", imm8_10,
+ nameIReg32( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ imm8_10 = (Int)(insn[3+alen] & 3);
+ assign( src_elems, loadLE( Ity_I32, mkexpr(addr) ) );
+ delta += 3+alen+1;
+ DIP( "pinsrd $%d, %s,%s\n",
+ imm8_10, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ assign(z32, mkU32(0));
+
+ UShort mask = 0;
+ switch (imm8_10) {
+ case 3: mask = 0x0FFF;
+ assign(src_vec, mk128from32s(src_elems, z32, z32, z32));
+ break;
+ case 2: mask = 0xF0FF;
+ assign(src_vec, mk128from32s(z32, src_elems, z32, z32));
+ break;
+ case 1: mask = 0xFF0F;
+ assign(src_vec, mk128from32s(z32, z32, src_elems, z32));
+ break;
+ case 0: mask = 0xFFF0;
+ assign(src_vec, mk128from32s(z32, z32, z32, src_elems));
+ break;
+ default: vassert(0);
+ }
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_OrV128, mkexpr(src_vec),
+ binop( Iop_AndV128,
+ getXMMReg( gregOfRexRM(pfx, modrm) ),
+ mkV128(mask) ) ) );
+
+ goto decode_success;
+ }
+
+ /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
+ Extract byte from r32/m8 and insert into xmm1 */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x20 ) {
+
+ Int imm8;
+ IRTemp new8 = newTemp(Ity_I64);
+
+ modrm = insn[3];
+
+ if ( epartIsReg( modrm ) ) {
+ imm8 = (Int)(insn[3+1] & 0xF);
+ assign( new8, binop(Iop_And64,
+ unop(Iop_32Uto64,
+ getIReg32(eregOfRexRM(pfx,modrm))),
+ mkU64(0xFF)));
+ delta += 3+1+1;
+ DIP( "pinsrb $%d,%s,%s\n", imm8,
+ nameIReg32( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
+ imm8 = (Int)(insn[3+alen] & 0xF);
+ assign( new8, unop(Iop_8Uto64, loadLE( Ity_I8, mkexpr(addr) )));
+ delta += 3+alen+1;
+ DIP( "pinsrb $%d,%s,%s\n",
+ imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ // Create a V128 value which has the selected byte in the
+ // specified lane, and zeroes everywhere else.
+ IRTemp tmp128 = newTemp(Ity_V128);
+ IRTemp halfshift = newTemp(Ity_I64);
+ assign(halfshift, binop(Iop_Shl64,
+ mkexpr(new8), mkU8(8 * (imm8 & 7))));
+ vassert(imm8 >= 0 && imm8 <= 15);
+ if (imm8 < 8) {
+ assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
+ } else {
+ assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
+ }
+
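+      // The mask has every bit set except the one for lane imm8, so
+      // the AndV128 below zeroes just that byte of the destination
+      // before the new byte is OR'd in.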
+ UShort mask = ~(1 << imm8);
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_OrV128,
+ mkexpr(tmp128),
+ binop( Iop_AndV128,
+ getXMMReg( gregOfRexRM(pfx, modrm) ),
+ mkV128(mask) ) ) );
+
+ goto decode_success;
+ }
+
+   /* 66 0F 38 37 = PCMPGTQ
+      64x2 comparison (signed; the SDM does specify a signed compare)
+   */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x37) {
+ /* FIXME: this needs an alignment check */
+ delta = dis_SSEint_E_to_G( vbi, pfx, delta+3,
+ "pcmpgtq", Iop_CmpGT64Sx2, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
+ Maximum of Packed Signed Double Word Integers (XMM)
+ 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
+ Minimum of Packed Signed Double Word Integers (XMM) */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x3D || insn[2] == 0x39)) {
+ /* FIXME: this needs an alignment check */
+ Bool isMAX = insn[2] == 0x3D;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxsd" : "pminsd",
+ isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
+ False
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
+ Maximum of Packed Unsigned Doubleword Integers (XMM)
+ 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
+ Minimum of Packed Unsigned Doubleword Integers (XMM) */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x3F || insn[2] == 0x3B)) {
+ /* FIXME: this needs an alignment check */
+ Bool isMAX = insn[2] == 0x3F;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxud" : "pminud",
+ isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
+ False
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
+ Maximum of Packed Unsigned Word Integers (XMM)
+ 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
+ Minimum of Packed Unsigned Word Integers (XMM)
+ */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x3E || insn[2] == 0x3A)) {
+ /* FIXME: this needs an alignment check */
+ Bool isMAX = insn[2] == 0x3E;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxuw" : "pminuw",
+ isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
+ False
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128
+ 8Sx16 (signed) max
+ 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128
+ 8Sx16 (signed) min
+ */
+ if ( have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x3C || insn[2] == 0x38)) {
+ /* FIXME: this needs an alignment check */
+ Bool isMAX = insn[2] == 0x3C;
+ delta = dis_SSEint_E_to_G(
+ vbi, pfx, delta+3,
+ isMAX ? "pmaxsb" : "pminsb",
+ isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
+ False
+ );
+ goto decode_success;
+ }
+
+ /* 66 0f 38 20 /r = PMOVSXBW xmm1, xmm2/m64
+ Packed Move with Sign Extend from Byte to Word (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x20 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcVec = newTemp(Ity_V128);
+
+ if ( epartIsReg( modrm ) ) {
+ assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmovsxbw %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcVec,
+ unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
+ delta += 3+alen;
+ DIP( "pmovsxbw %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
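+      /* Interleave the 8 low source bytes with zero bytes to widen
+         them to 16-bit lanes, then shift each lane left and
+         arithmetic-shift it right by 8 to sign-extend from bit 7. */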
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_SarN16x8,
+ binop( Iop_ShlN16x8,
+ binop( Iop_InterleaveLO8x16,
+ IRExpr_Const( IRConst_V128(0) ),
+ mkexpr(srcVec) ),
+ mkU8(8) ),
+ mkU8(8) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 21 /r = PMOVSXBD xmm1, xmm2/m32
+ Packed Move with Sign Extend from Byte to DWord (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x21 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcVec = newTemp(Ity_V128);
+
+ if ( epartIsReg( modrm ) ) {
+ assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmovsxbd %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcVec,
+ unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
+ delta += 3+alen;
+ DIP( "pmovsxbd %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ IRTemp zeroVec = newTemp(Ity_V128);
+ assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_SarN32x4,
+ binop( Iop_ShlN32x4,
+ binop( Iop_InterleaveLO8x16,
+ mkexpr(zeroVec),
+ binop( Iop_InterleaveLO8x16,
+ mkexpr(zeroVec),
+ mkexpr(srcVec) ) ),
+ mkU8(24) ), mkU8(24) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
+ Packed Move with Sign Extend from Byte to QWord (XMM) */
+ if ( have66noF2noF3(pfx)
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x22 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcBytes = newTemp(Ity_I16);
+
+ if ( epartIsReg(modrm) ) {
+ assign( srcBytes, getXMMRegLane16( eregOfRexRM(pfx, modrm), 0 ) );
+ delta += 3+1;
+ DIP( "pmovsxbq %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
+ delta += 3+alen;
+ DIP( "pmovsxbq %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ putXMMReg( gregOfRexRM( pfx, modrm ),
+ binop( Iop_64HLtoV128,
+ unop( Iop_8Sto64,
+ unop( Iop_16HIto8,
+ mkexpr(srcBytes) ) ),
+ unop( Iop_8Sto64,
+ unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 23 /r = PMOVSXWD xmm1, xmm2/m64
+ Packed Move with Sign Extend from Word to DWord (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x23 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcVec = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmovsxwd %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcVec,
+ unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
+ delta += 3+alen;
+ DIP( "pmovsxwd %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_SarN32x4,
+ binop( Iop_ShlN32x4,
+ binop( Iop_InterleaveLO16x8,
+ IRExpr_Const( IRConst_V128(0) ),
+ mkexpr(srcVec) ),
+ mkU8(16) ),
+ mkU8(16) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
+ Packed Move with Sign Extend from Word to QWord (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x24 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcBytes = newTemp(Ity_I32);
+
+ if ( epartIsReg( modrm ) ) {
+ assign( srcBytes, getXMMRegLane32( eregOfRexRM(pfx, modrm), 0 ) );
+ delta += 3+1;
+ DIP( "pmovsxwq %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
+ delta += 3+alen;
+ DIP( "pmovsxwq %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ putXMMReg( gregOfRexRM( pfx, modrm ),
+ binop( Iop_64HLtoV128,
+ unop( Iop_16Sto64,
+ unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
+ unop( Iop_16Sto64,
+ unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
+ Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x25 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcBytes = newTemp(Ity_I64);
+
+ if ( epartIsReg(modrm) ) {
+ assign( srcBytes, getXMMRegLane64( eregOfRexRM(pfx, modrm), 0 ) );
+ delta += 3+1;
+ DIP( "pmovsxdq %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
+ delta += 3+alen;
+ DIP( "pmovsxdq %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_64HLtoV128,
+ unop( Iop_32Sto64,
+ unop( Iop_64HIto32, mkexpr(srcBytes) ) ),
+ unop( Iop_32Sto64,
+ unop( Iop_64to32, mkexpr(srcBytes) ) ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 30 /r = PMOVZXBW xmm1, xmm2/m64
+ Packed Move with Zero Extend from Byte to Word (XMM) */
+ if ( have66noF2noF3(pfx)
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x30 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcVec = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmovzxbw %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcVec,
+ unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
+ delta += 3+alen;
+ DIP( "pmovzxbw %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
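+      /* Interleaving the 8 low source bytes with zero bytes yields
+         the zero-extended 16-bit lanes directly. */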
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_InterleaveLO8x16,
+ IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 31 /r = PMOVZXBD xmm1, xmm2/m32
+ Packed Move with Zero Extend from Byte to DWord (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x31 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcVec = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmovzxbd %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcVec,
+ unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
+ delta += 3+alen;
+ DIP( "pmovzxbd %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ IRTemp zeroVec = newTemp(Ity_V128);
+ assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
+
+ putXMMReg( gregOfRexRM( pfx, modrm ),
+ binop( Iop_InterleaveLO8x16,
+ mkexpr(zeroVec),
+ binop( Iop_InterleaveLO8x16,
+ mkexpr(zeroVec), mkexpr(srcVec) ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
+ Packed Move with Zero Extend from Byte to QWord (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x32 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcVec = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmovzxbq %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcVec,
+ unop( Iop_32UtoV128,
+ unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ) ) ) );
+ delta += 3+alen;
+ DIP( "pmovzxbq %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ IRTemp zeroVec = newTemp(Ity_V128);
+ assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
+
+ putXMMReg( gregOfRexRM( pfx, modrm ),
+ binop( Iop_InterleaveLO8x16,
+ mkexpr(zeroVec),
+ binop( Iop_InterleaveLO8x16,
+ mkexpr(zeroVec),
+ binop( Iop_InterleaveLO8x16,
+ mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 33 /r = PMOVZXWD xmm1, xmm2/m64
+ Packed Move with Zero Extend from Word to DWord (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x33 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcVec = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmovzxwd %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcVec,
+ unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
+ delta += 3+alen;
+ DIP( "pmovzxwd %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_InterleaveLO16x8,
+ IRExpr_Const( IRConst_V128(0) ),
+ mkexpr(srcVec) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
+ Packed Move with Zero Extend from Word to QWord (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x34 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcVec = newTemp(Ity_V128);
+
+ if ( epartIsReg( modrm ) ) {
+ assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmovzxwq %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcVec,
+ unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
+ delta += 3+alen;
+ DIP( "pmovzxwq %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ IRTemp zeroVec = newTemp( Ity_V128 );
+ assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
+
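+      /* Two rounds of 16-bit interleaving with zero widen each of
+         the two low words to a zero-extended 64-bit lane. */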
+ putXMMReg( gregOfRexRM( pfx, modrm ),
+ binop( Iop_InterleaveLO16x8,
+ mkexpr(zeroVec),
+ binop( Iop_InterleaveLO16x8,
+ mkexpr(zeroVec), mkexpr(srcVec) ) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
+ Packed Move with Zero Extend from DWord to QWord (XMM) */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x35 ) {
+
+ modrm = insn[3];
+
+ IRTemp srcVec = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmovzxdq %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( srcVec,
+ unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
+ delta += 3+alen;
+ DIP( "pmovzxdq %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_InterleaveLO32x4,
+ IRExpr_Const( IRConst_V128(0) ),
+ mkexpr(srcVec) ) );
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 40 /r = PMULLD xmm1, xmm2/m128
+ 32x4 integer multiply from xmm2/m128 to xmm1 */
+ if ( have66noF2noF3( pfx )
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x40 ) {
+
+ modrm = insn[3];
+
+ IRTemp argL = newTemp(Ity_V128);
+ IRTemp argR = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
+ delta += 3+1;
+ DIP( "pmulld %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
+ delta += 3+alen;
+ DIP( "pmulld %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
+
+ putXMMReg( gregOfRexRM(pfx, modrm),
+ binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
+
+ goto decode_success;
+ }
+
+
+ /* F3 0F B8 = POPCNT{W,L,Q}
+ Count the number of 1 bits in a register
+ */
+ if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
+ && insn[0] == 0x0F && insn[1] == 0xB8) {
+ vassert(sz == 2 || sz == 4 || sz == 8);
+ /*IRType*/ ty = szToITy(sz);
+ IRTemp src = newTemp(ty);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign(src, getIRegE(sz, pfx, modrm));
+ delta += 2+1;
+ DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
+ nameIRegG(sz, pfx, modrm));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0);
+ assign(src, loadLE(ty, mkexpr(addr)));
+ delta += 2+alen;
+ DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
+ nameIRegG(sz, pfx, modrm));
+ }
+
+ IRTemp result = gen_POPCOUNT(ty, src);
+ putIRegG(sz, pfx, modrm, mkexpr(result));
+
+ // Update flags. This is pretty lame .. perhaps can do better
+ // if this turns out to be performance critical.
+ // O S A C P are cleared. Z is set if SRC == 0.
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ binop(Iop_Shl64,
+ unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64,
+ widenUto64(mkexpr(src)),
+ mkU64(0))),
+ mkU8(AMD64G_CC_SHIFT_Z))));
+
+ goto decode_success;
+ }
+
+
+ /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
+ (Partial implementation only -- only deal with cases where
+ the rounding mode is specified directly by the immediate byte.)
+ 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
+ (Limitations ditto)
+ */
+ if (have66noF2noF3(pfx)
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A
+ && (insn[2] == 0x0B || insn[2] == 0x0A)) {
+
+ Bool isD = insn[2] == 0x0B;
+ IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
+ IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
+ Int imm = 0;
+
+ modrm = insn[3];
+
+ if (epartIsReg(modrm)) {
+ assign( src,
+ isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
+ : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
+ imm = insn[3+1];
+ if (imm & ~3) goto decode_failure;
+ delta += 3+1+1;
+ DIP( "rounds%c $%d,%s,%s\n",
+ isD ? 'd' : 's',
+ imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
+ imm = insn[3+alen];
+ if (imm & ~3) goto decode_failure;
+ delta += 3+alen+1;
+         DIP( "rounds%c $%d,%s,%s\n",
+              isD ? 'd' : 's',
+              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ /* (imm & 3) contains an Intel-encoded rounding mode. Because
+ that encoding is the same as the encoding for IRRoundingMode,
+ we can use that value directly in the IR as a rounding
+ mode. */
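+      /* For reference, that shared encoding is:
+           0 -> round to nearest, ties to even
+           1 -> round towards -infinity
+           2 -> round towards +infinity
+           3 -> round towards zero (truncate) */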
+ assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
+ mkU32(imm & 3), mkexpr(src)) );
+
+ if (isD)
+ putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
+ else
+ putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
+
+ goto decode_success;
+ }
+
+   /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
+      which we can only decode if we're sure this is an AMD cpu that
+      supports LZCNT, since otherwise it's BSR, which behaves
+      differently. */
+ if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
+ && insn[0] == 0x0F && insn[1] == 0xBD
+ && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
+ vassert(sz == 2 || sz == 4 || sz == 8);
+ /*IRType*/ ty = szToITy(sz);
+ IRTemp src = newTemp(ty);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign(src, getIRegE(sz, pfx, modrm));
+ delta += 2+1;
+ DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
+ nameIRegG(sz, pfx, modrm));
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0);
+ assign(src, loadLE(ty, mkexpr(addr)));
+ delta += 2+alen;
+ DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
+ nameIRegG(sz, pfx, modrm));
+ }
+
+ IRTemp res = gen_LZCNT(ty, src);
+ putIRegG(sz, pfx, modrm, mkexpr(res));
+
+ // Update flags. This is pretty lame .. perhaps can do better
+ // if this turns out to be performance critical.
+ // O S A P are cleared. Z is set if RESULT == 0.
+ // C is set if SRC is zero.
+ IRTemp src64 = newTemp(Ity_I64);
+ IRTemp res64 = newTemp(Ity_I64);
+ assign(src64, widenUto64(mkexpr(src)));
+ assign(res64, widenUto64(mkexpr(res)));
+
+ IRTemp oszacp = newTemp(Ity_I64);
+ assign(
+ oszacp,
+ binop(Iop_Or64,
+ binop(Iop_Shl64,
+ unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
+ mkU8(AMD64G_CC_SHIFT_Z)),
+ binop(Iop_Shl64,
+ unop(Iop_1Uto64,
+ binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
+ mkU8(AMD64G_CC_SHIFT_C))
+ )
+ );
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
+
+ goto decode_success;
+ }
+
+ /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
+ 66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
+ 66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
+ 66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
+ (selected special cases that actually occur in glibc,
+ not by any means a complete implementation.)
+ */
+ if (have66noF2noF3(pfx)
+ && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A
+ && (insn[2] >= 0x60 && insn[2] <= 0x63)) {
+
+ UInt isISTRx = insn[2] & 2;
+ UInt isxSTRM = (insn[2] & 1) ^ 1;
+ UInt regNoL = 0;
+ UInt regNoR = 0;
+ UChar imm = 0;
+
+ /* This is a nasty kludge. We need to pass 2 x V128 to the
+ helper (which is clean). Since we can't do that, use a dirty
+ helper to compute the results directly from the XMM regs in
+ the guest state. That means for the memory case, we need to
+ move the left operand into a pseudo-register (XMM16, let's
+ call it). */
+ modrm = insn[3];
+ if (epartIsReg(modrm)) {
+ regNoL = eregOfRexRM(pfx, modrm);
+ regNoR = gregOfRexRM(pfx, modrm);
+ imm = insn[3+1];
+ delta += 3+1+1;
+ } else {
+ regNoL = 16; /* use XMM16 as an intermediary */
+ regNoR = gregOfRexRM(pfx, modrm);
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ /* No alignment check; I guess that makes sense, given that
+ these insns are for dealing with C style strings. */
+ stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) ));
+ imm = insn[3+alen];
+ delta += 3+alen+1;
+ }
+
+ /* Now we know the XMM reg numbers for the operands, and the
+ immediate byte. Is it one we can actually handle? Throw out
+ any cases for which the helper function has not been
+ verified. */
+ switch (imm) {
+ case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12:
+ case 0x1A: case 0x3A: case 0x44: case 0x4A:
+ break;
+ default:
+ goto decode_failure;
+ }
+
+ /* Who ya gonna call? Presumably not Ghostbusters. */
+ void* fn = &amd64g_dirtyhelper_PCMPxSTRx;
+ HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";
+
+ /* Round up the arguments. Note that this is a kludge -- the
+ use of mkU64 rather than mkIRExpr_HWord implies the
+ assumption that the host's word size is 64-bit. */
+ UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL);
+ UInt gstOffR = xmmGuestRegOffset(regNoR);
+
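+      /* opc4_and_imm packs the final opcode byte (0x60..0x63) into
+         bits 15:8 and the imm8 into bits 7:0, so the helper can
+         select both the variant and its mode from one argument. */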
+ IRExpr* opc4_and_imm = mkU64((insn[2] << 8) | (imm & 0xFF));
+ IRExpr* gstOffLe = mkU64(gstOffL);
+ IRExpr* gstOffRe = mkU64(gstOffR);
+ IRExpr* edxIN = isISTRx ? mkU64(0) : getIRegRDX(8);
+ IRExpr* eaxIN = isISTRx ? mkU64(0) : getIRegRAX(8);
+ IRExpr** args
+ = mkIRExprVec_5( opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );
+
+ IRTemp resT = newTemp(Ity_I64);
+ IRDirty* d = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
+ /* It's not really a dirty call, but we can't use the clean
+ helper mechanism here for the very lame reason that we can't
+ pass 2 x V128s by value to a helper, nor get one back. Hence
+ this roundabout scheme. */
+ d->needsBBP = True;
+ d->nFxState = 2;
+ d->fxState[0].fx = Ifx_Read;
+ d->fxState[0].offset = gstOffL;
+ d->fxState[0].size = sizeof(U128);
+ d->fxState[1].fx = Ifx_Read;
+ d->fxState[1].offset = gstOffR;
+ d->fxState[1].size = sizeof(U128);
+ if (isxSTRM) {
+ /* Declare that the helper writes XMM0. */
+ d->nFxState = 3;
+ d->fxState[2].fx = Ifx_Write;
+ d->fxState[2].offset = xmmGuestRegOffset(0);
+ d->fxState[2].size = sizeof(U128);
+ }
+
+ stmt( IRStmt_Dirty(d) );
+
+ /* Now resT[15:0] holds the new OSZACP values, so the condition
+         codes must be updated.  And for an xSTRI case, resT[31:16]
+ holds the new ECX value, so stash that too. */
+ if (!isxSTRM) {
+ putIReg64(R_RCX, binop(Iop_And64,
+ binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
+ mkU64(0xFFFF)));
+ }
+
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
+ ));
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+
+ if (regNoL == 16) {
+ DIP("pcmp%cstr%c $%x,%s,%s\n",
+ isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
+ (UInt)imm, dis_buf, nameXMMReg(regNoR));
+ } else {
+ DIP("pcmp%cstr%c $%x,%s,%s\n",
+ isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
+ (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
+ }
+
+ goto decode_success;
+ }
+
+
+ /* 66 0f 38 17 /r = PTEST xmm1, xmm2/m128
+ Logical compare (set ZF and CF from AND/ANDN of the operands) */
+ if (have66noF2noF3( pfx ) && sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x17) {
+ modrm = insn[3];
+ IRTemp vecE = newTemp(Ity_V128);
+ IRTemp vecG = newTemp(Ity_V128);
+
+ if ( epartIsReg(modrm) ) {
+ assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
+ delta += 3+1;
+ DIP( "ptest %s,%s\n",
+ nameXMMReg( eregOfRexRM(pfx, modrm) ),
+ nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ } else {
+ addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
+ delta += 3+alen;
+ DIP( "ptest %s,%s\n",
+ dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
+ }
+
+ assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
+
+ /* Set Z=1 iff (vecE & vecG) == 0
+ Set C=1 iff (vecE & not vecG) == 0
+ */
+
+ /* andV, andnV: vecE & vecG, vecE and not(vecG) */
+ IRTemp andV = newTemp(Ity_V128);
+ IRTemp andnV = newTemp(Ity_V128);
+ assign(andV, binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
+ assign(andnV, binop(Iop_AndV128,
+ mkexpr(vecE),
+ binop(Iop_XorV128, mkexpr(vecG),
+ mkV128(0xFFFF))));
+
+ /* The same, but reduced to 64-bit values, by or-ing the top
+ and bottom 64-bits together. It relies on this trick:
+
+ InterleaveLO64x2([a,b],[c,d]) == [b,d] hence
+
+ InterleaveLO64x2([a,b],[a,b]) == [b,b] and similarly
+ InterleaveHI64x2([a,b],[a,b]) == [a,a]
+
+ and so the OR of the above 2 exprs produces
+ [a OR b, a OR b], from which we simply take the lower half.
+ */
+ IRTemp and64 = newTemp(Ity_I64);
+ IRTemp andn64 = newTemp(Ity_I64);
+
+ assign(
+ and64,
+ unop(Iop_V128to64,
+ binop(Iop_OrV128,
+ binop(Iop_InterleaveLO64x2, mkexpr(andV), mkexpr(andV)),
+ binop(Iop_InterleaveHI64x2, mkexpr(andV), mkexpr(andV))
+ )
+ )
+ );
+
+ assign(
+ andn64,
+ unop(Iop_V128to64,
+ binop(Iop_OrV128,
+ binop(Iop_InterleaveLO64x2, mkexpr(andnV), mkexpr(andnV)),
+ binop(Iop_InterleaveHI64x2, mkexpr(andnV), mkexpr(andnV))
+ )
+ )
+ );
+
+ /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
+ slice out the Z and C bits conveniently. We use the standard
+ trick all-zeroes -> all-zeroes, anything-else -> all-ones
+ done by "(x | -x) >>s (word-size - 1)".
+ */
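+      /* Worked example: and64 == 0 gives (0 | -0) >>s 63 == 0, and
+         Not64 of that is all-ones, from which MASK_Z is sliced below,
+         hence Z=1.  Any nonzero and64 has the sign bit set in
+         (x | -x), so the shift yields all-ones and Not64 gives zero,
+         hence Z=0. */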
+ IRTemp z64 = newTemp(Ity_I64);
+ IRTemp c64 = newTemp(Ity_I64);
+ assign(z64,
+ unop(Iop_Not64,
+ binop(Iop_Sar64,
+ binop(Iop_Or64,
+ binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
+ mkexpr(and64)
+ ),
+ mkU8(63)))
+ );
+
+ assign(c64,
+ unop(Iop_Not64,
+ binop(Iop_Sar64,
+ binop(Iop_Or64,
+ binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
+ mkexpr(andn64)
+ ),
+ mkU8(63)))
+ );
+
+ /* And finally, slice out the Z and C flags and set the flags
+ thunk to COPY for them. OSAP are set to zero. */
+ IRTemp newOSZACP = newTemp(Ity_I64);
+ assign(newOSZACP,
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
+ binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))
+ )
+ );
+
+ stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+
+ goto decode_success;
+ }
+
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSE4 decoder --- */
+ /* ---------------------------------------------------- */
+
+ /*after_sse_decoders:*/
+
+ /* Get the primary opcode. */
+ opc = getUChar(delta); delta++;
+
+ /* We get here if the current insn isn't SSE, or this CPU doesn't
+ support SSE. */
+
+ switch (opc) {
+
+ /* ------------------------ Control flow --------------- */
+
+ case 0xC2: /* RET imm16 */
+ if (have66orF2orF3(pfx)) goto decode_failure;
+ d64 = getUDisp16(delta);
+ delta += 2;
+ dis_ret(vbi, d64);
+ dres.whatNext = Dis_StopHere;
+ DIP("ret %lld\n", d64);
+ break;
+
+ case 0xC3: /* RET */
+ if (have66orF2(pfx)) goto decode_failure;
+ /* F3 is acceptable on AMD. */
+ dis_ret(vbi, 0);
+ dres.whatNext = Dis_StopHere;
+ DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
+ break;
+
+ case 0xE8: /* CALL J4 */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ d64 = getSDisp32(delta); delta += 4;
+ d64 += (guest_RIP_bbstart+delta);
+ /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
+ t1 = newTemp(Ity_I64);
+ assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
+ putIReg64(R_RSP, mkexpr(t1));
+ storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
+ t2 = newTemp(Ity_I64);
+ assign(t2, mkU64((Addr64)d64));
+ make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
+ if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
+ /* follow into the call target. */
+ dres.whatNext = Dis_ResteerU;
+ dres.continueAt = d64;
+ } else {
+ jmp_lit(Ijk_Call,d64);
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("call 0x%llx\n",d64);
+ break;
+
+//.. //-- case 0xC8: /* ENTER */
+//.. //-- d32 = getUDisp16(eip); eip += 2;
+//.. //-- abyte = getUChar(delta); delta++;
+//.. //--
+//.. //-- vg_assert(sz == 4);
+//.. //-- vg_assert(abyte == 0);
+//.. //--
+//.. //-- t1 = newTemp(cb); t2 = newTemp(cb);
+//.. //-- uInstr2(cb, GET, sz, ArchReg, R_EBP, TempReg, t1);
+//.. //-- uInstr2(cb, GET, 4, ArchReg, R_ESP, TempReg, t2);
+//.. //-- uInstr2(cb, SUB, 4, Literal, 0, TempReg, t2);
+//.. //-- uLiteral(cb, sz);
+//.. //-- uInstr2(cb, PUT, 4, TempReg, t2, ArchReg, R_ESP);
+//.. //-- uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
+//.. //-- uInstr2(cb, PUT, 4, TempReg, t2, ArchReg, R_EBP);
+//.. //-- if (d32) {
+//.. //-- uInstr2(cb, SUB, 4, Literal, 0, TempReg, t2);
+//.. //-- uLiteral(cb, d32);
+//.. //-- uInstr2(cb, PUT, 4, TempReg, t2, ArchReg, R_ESP);
+//.. //-- }
+//.. //-- DIP("enter 0x%x, 0x%x", d32, abyte);
+//.. //-- break;
+
+ case 0xC9: /* LEAVE */
+ /* In 64-bit mode this defaults to a 64-bit operand size. There
+ is no way to encode a 32-bit variant. Hence sz==4 but we do
+            it as if sz==8. */
+ if (sz != 4)
+ goto decode_failure;
+ t1 = newTemp(Ity_I64);
+ t2 = newTemp(Ity_I64);
+ assign(t1, getIReg64(R_RBP));
+         /* First PUT RSP looks redundant, but we need it because RSP
+            must always be up-to-date for Memcheck to work... */
+ putIReg64(R_RSP, mkexpr(t1));
+ assign(t2, loadLE(Ity_I64,mkexpr(t1)));
+ putIReg64(R_RBP, mkexpr(t2));
+ putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
+ DIP("leave\n");
+ break;
+
+//.. //-- /* ---------------- Misc weird-ass insns --------------- */
+//.. //--
+//.. //-- case 0x27: /* DAA */
+//.. //-- case 0x2F: /* DAS */
+//.. //-- t1 = newTemp(cb);
+//.. //-- uInstr2(cb, GET, 1, ArchReg, R_AL, TempReg, t1);
+//.. //-- /* Widen %AL to 32 bits, so it's all defined when we push it. */
+//.. //-- uInstr1(cb, WIDEN, 4, TempReg, t1);
+//.. //-- uWiden(cb, 1, False);
+//.. //-- uInstr0(cb, CALLM_S, 0);
+//.. //-- uInstr1(cb, PUSH, 4, TempReg, t1);
+//.. //-- uInstr1(cb, CALLM, 0, Lit16,
+//.. //-- opc == 0x27 ? VGOFF_(helper_DAA) : VGOFF_(helper_DAS) );
+//.. //-- uFlagsRWU(cb, FlagsAC, FlagsSZACP, FlagO);
+//.. //-- uInstr1(cb, POP, 4, TempReg, t1);
+//.. //-- uInstr0(cb, CALLM_E, 0);
+//.. //-- uInstr2(cb, PUT, 1, TempReg, t1, ArchReg, R_AL);
+//.. //-- DIP(opc == 0x27 ? "daa\n" : "das\n");
+//.. //-- break;
+//.. //--
+//.. //-- case 0x37: /* AAA */
+//.. //-- case 0x3F: /* AAS */
+//.. //-- t1 = newTemp(cb);
+//.. //-- uInstr2(cb, GET, 2, ArchReg, R_EAX, TempReg, t1);
+//.. //-- /* Widen %AL to 32 bits, so it's all defined when we push it. */
+//.. //-- uInstr1(cb, WIDEN, 4, TempReg, t1);
+//.. //-- uWiden(cb, 2, False);
+//.. //-- uInstr0(cb, CALLM_S, 0);
+//.. //-- uInstr1(cb, PUSH, 4, TempReg, t1);
+//.. //-- uInstr1(cb, CALLM, 0, Lit16,
+//.. //-- opc == 0x37 ? VGOFF_(helper_AAA) : VGOFF_(helper_AAS) );
+//.. //-- uFlagsRWU(cb, FlagA, FlagsAC, FlagsEmpty);
+//.. //-- uInstr1(cb, POP, 4, TempReg, t1);
+//.. //-- uInstr0(cb, CALLM_E, 0);
+//.. //-- uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
+//.. //-- DIP(opc == 0x37 ? "aaa\n" : "aas\n");
+//.. //-- break;
+//.. //--
+//.. //-- case 0xD4: /* AAM */
+//.. //-- case 0xD5: /* AAD */
+//.. //-- d32 = getUChar(delta); delta++;
+//.. //-- if (d32 != 10) VG_(core_panic)("disInstr: AAM/AAD but base not 10 !");
+//.. //-- t1 = newTemp(cb);
+//.. //-- uInstr2(cb, GET, 2, ArchReg, R_EAX, TempReg, t1);
+//.. //-- /* Widen %AX to 32 bits, so it's all defined when we push it. */
+//.. //-- uInstr1(cb, WIDEN, 4, TempReg, t1);
+//.. //-- uWiden(cb, 2, False);
+//.. //-- uInstr0(cb, CALLM_S, 0);
+//.. //-- uInstr1(cb, PUSH, 4, TempReg, t1);
+//.. //-- uInstr1(cb, CALLM, 0, Lit16,
+//.. //-- opc == 0xD4 ? VGOFF_(helper_AAM) : VGOFF_(helper_AAD) );
+//.. //-- uFlagsRWU(cb, FlagsEmpty, FlagsSZP, FlagsEmpty);
+//.. //-- uInstr1(cb, POP, 4, TempReg, t1);
+//.. //-- uInstr0(cb, CALLM_E, 0);
+//.. //-- uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
+//.. //-- DIP(opc == 0xD4 ? "aam\n" : "aad\n");
+//.. //-- break;
+
+ /* ------------------------ CWD/CDQ -------------------- */
+
+      case 0x98: /* CBW / CWDE / CDQE */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz == 8) {
+ putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
+            DIP(/*"cdqe\n"*/"cltq\n");
+ break;
+ }
+ if (sz == 4) {
+ putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
+ DIP("cwtl\n");
+ break;
+ }
+ if (sz == 2) {
+ putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
+ DIP("cbw\n");
+ break;
+ }
+ goto decode_failure;
+
+ case 0x99: /* CWD/CDQ/CQO */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ vassert(sz == 2 || sz == 4 || sz == 8);
+ ty = szToITy(sz);
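+         /* Fill rDX with copies of rAX's sign bit, by arithmetically
+            shifting rAX right by (width-1) bits; e.g. for sz==4 this
+            is "EDX = EAX >>s 31". */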
+ putIRegRDX( sz,
+ binop(mkSizedOp(ty,Iop_Sar8),
+ getIRegRAX(sz),
+ mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
+ DIP(sz == 2 ? "cwd\n"
+ : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
+ : "cqo\n"));
+ break;
+
+ /* ------------------------ FPU ops -------------------- */
+
+ case 0x9E: /* SAHF */
+ codegen_SAHF();
+ DIP("sahf\n");
+ break;
+
+ case 0x9F: /* LAHF */
+ codegen_LAHF();
+ DIP("lahf\n");
+ break;
+
+ case 0x9B: /* FWAIT */
+ /* ignore? */
+ DIP("fwait\n");
+ break;
+
+ case 0xD8:
+ case 0xD9:
+ case 0xDA:
+ case 0xDB:
+ case 0xDC:
+ case 0xDD:
+ case 0xDE:
+ case 0xDF: {
+ Bool redundantREXWok = False;
+
+ if (haveF2orF3(pfx))
+ goto decode_failure;
+
+ /* kludge to tolerate redundant rex.w prefixes (should do this
+ properly one day) */
+ /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
+ if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
+ redundantREXWok = True;
+
+ if ( (sz == 4
+ || (sz == 8 && redundantREXWok))
+ && haveNo66noF2noF3(pfx)) {
+ Long delta0 = delta;
+ Bool decode_OK = False;
+ delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
+ if (!decode_OK) {
+ delta = delta0;
+ goto decode_failure;
+ }
+ break;
+ } else {
+ goto decode_failure;
+ }
+ }
+
+ /* ------------------------ INT ------------------------ */
+
+ case 0xCC: /* INT 3 */
+ jmp_lit(Ijk_SigTRAP, guest_RIP_bbstart + delta);
+ dres.whatNext = Dis_StopHere;
+ DIP("int $0x3\n");
+ break;
+
+ case 0xCD: { /* INT imm8 */
+ IRJumpKind jk = Ijk_Boring;
+ if (have66orF2orF3(pfx)) goto decode_failure;
+ d64 = getUChar(delta); delta++;
+ switch (d64) {
+ case 32: jk = Ijk_Sys_int32; break;
+ default: goto decode_failure;
+ }
+ guest_RIP_next_mustcheck = True;
+ guest_RIP_next_assumed = guest_RIP_bbstart + delta;
+ jmp_lit(jk, guest_RIP_next_assumed);
+ /* It's important that all ArchRegs carry their up-to-date value
+ at this point. So we declare an end-of-block here, which
+ forces any TempRegs caching ArchRegs to be flushed. */
+ dres.whatNext = Dis_StopHere;
+ DIP("int $0x%02x\n", (UInt)d64);
+ break;
+ }
+
+ /* ------------------------ Jcond, byte offset --------- */
+
+ case 0xEB: /* Jb (jump, byte offset) */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 4)
+ goto decode_failure; /* JRS added 2004 July 11 */
+ d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
+ delta++;
+ if (resteerOkFn(callback_opaque,d64)) {
+ dres.whatNext = Dis_ResteerU;
+ dres.continueAt = d64;
+ } else {
+ jmp_lit(Ijk_Boring,d64);
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("jmp-8 0x%llx\n", d64);
+ break;
+
+ case 0xE9: /* Jv (jump, 16/32 offset) */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 4)
+ goto decode_failure; /* JRS added 2004 July 11 */
+ d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
+ delta += sz;
+ if (resteerOkFn(callback_opaque,d64)) {
+ dres.whatNext = Dis_ResteerU;
+ dres.continueAt = d64;
+ } else {
+ jmp_lit(Ijk_Boring,d64);
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("jmp 0x%llx\n", d64);
+ break;
+
+ case 0x70:
+ case 0x71:
+ case 0x72: /* JBb/JNAEb (jump below) */
+ case 0x73: /* JNBb/JAEb (jump not below) */
+ case 0x74: /* JZb/JEb (jump zero) */
+ case 0x75: /* JNZb/JNEb (jump not zero) */
+ case 0x76: /* JBEb/JNAb (jump below or equal) */
+ case 0x77: /* JNBEb/JAb (jump not below or equal) */
+ case 0x78: /* JSb (jump negative) */
+      case 0x79: /* JNSb (jump not negative) */
+ case 0x7A: /* JP (jump parity even) */
+ case 0x7B: /* JNP/JPO (jump parity odd) */
+ case 0x7C: /* JLb/JNGEb (jump less) */
+ case 0x7D: /* JGEb/JNLb (jump greater or equal) */
+ case 0x7E: /* JLEb/JNGb (jump less or equal) */
+ case 0x7F: /* JGb/JNLEb (jump greater) */
+ { Long jmpDelta;
+ HChar* comment = "";
+ if (haveF2orF3(pfx)) goto decode_failure;
+ jmpDelta = getSDisp8(delta);
+ vassert(-128 <= jmpDelta && jmpDelta < 128);
+ d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
+ delta++;
+ if (resteerCisOk
+ && vex_control.guest_chase_cond
+ && (Addr64)d64 != (Addr64)guest_RIP_bbstart
+ && jmpDelta < 0
+ && resteerOkFn( callback_opaque, d64) ) {
+ /* Speculation: assume this backward branch is taken. So we
+ need to emit a side-exit to the insn following this one,
+ on the negation of the condition, and continue at the
+ branch target address (d64). If we wind up back at the
+ first instruction of the trace, just stop; it's better to
+ let the IR loop unroller handle that case. */
+ stmt( IRStmt_Exit(
+ mk_amd64g_calculate_condition(
+ (AMD64Condcode)(1 ^ (opc - 0x70))),
+ Ijk_Boring,
+ IRConst_U64(guest_RIP_bbstart+delta) ) );
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = d64;
+ comment = "(assumed taken)";
+ }
+ else
+ if (resteerCisOk
+ && vex_control.guest_chase_cond
+ && (Addr64)d64 != (Addr64)guest_RIP_bbstart
+ && jmpDelta >= 0
+ && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
+ /* Speculation: assume this forward branch is not taken. So
+ we need to emit a side-exit to d64 (the dest) and continue
+ disassembling at the insn immediately following this
+ one. */
+ stmt( IRStmt_Exit(
+ mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
+ Ijk_Boring,
+ IRConst_U64(d64) ) );
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = guest_RIP_bbstart+delta;
+ comment = "(assumed not taken)";
+ }
+ else {
+ /* Conservative default translation - end the block at this
+ point. */
+ jcc_01( (AMD64Condcode)(opc - 0x70),
+ guest_RIP_bbstart+delta,
+ d64 );
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
+ break;
+ }
+
+ case 0xE3:
+         /* JRCXZ or JECXZ, depending on the address size override. */
+ if (have66orF2orF3(pfx)) goto decode_failure;
+ d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
+ delta++;
+ if (haveASO(pfx)) {
+ /* 32-bit */
+ stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
+ unop(Iop_32Uto64, getIReg32(R_RCX)),
+ mkU64(0)),
+ Ijk_Boring,
+ IRConst_U64(d64))
+ );
+ DIP("jecxz 0x%llx\n", d64);
+ } else {
+ /* 64-bit */
+ stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
+ getIReg64(R_RCX),
+ mkU64(0)),
+ Ijk_Boring,
+ IRConst_U64(d64))
+ );
+ DIP("jrcxz 0x%llx\n", d64);
+ }
+ break;
+
+ case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
+ case 0xE1: /* LOOPE disp8: decrement count, jump if count != 0 && ZF==1 */
+ case 0xE2: /* LOOP disp8: decrement count, jump if count != 0 */
+      { /* The docs say this uses rCX as a count, with the width chosen
+           by the address size override, not the operand-size one.  We
+           reject address-size overrides just below, so the count is
+           always RCX. */
+ IRExpr* zbit = NULL;
+ IRExpr* count = NULL;
+ IRExpr* cond = NULL;
+ HChar* xtra = NULL;
+
+ if (have66orF2orF3(pfx) || haveASO(pfx)) goto decode_failure;
+ d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
+ delta++;
+ putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
+
+ count = getIReg64(R_RCX);
+ cond = binop(Iop_CmpNE64, count, mkU64(0));
+ switch (opc) {
+ case 0xE2:
+ xtra = "";
+ break;
+ case 0xE1:
+ xtra = "e";
+ zbit = mk_amd64g_calculate_condition( AMD64CondZ );
+ cond = mkAnd1(cond, zbit);
+ break;
+ case 0xE0:
+ xtra = "ne";
+ zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
+ cond = mkAnd1(cond, zbit);
+ break;
+ default:
+ vassert(0);
+ }
+ stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64)) );
+
+ DIP("loop%s 0x%llx\n", xtra, d64);
+ break;
+ }
+
+ /* ------------------------ IMUL ----------------------- */
+
+ case 0x69: /* IMUL Iv, Ev, Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
+ break;
+ case 0x6B: /* IMUL Ib, Ev, Gv */
+ delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
+ break;
+
+ /* ------------------------ MOV ------------------------ */
+
+ case 0x88: /* MOV Gb,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_mov_G_E(vbi, pfx, 1, delta);
+ break;
+
+ case 0x89: /* MOV Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_mov_G_E(vbi, pfx, sz, delta);
+ break;
+
+ case 0x8A: /* MOV Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_mov_E_G(vbi, pfx, 1, delta);
+ break;
+
+ case 0x8B: /* MOV Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_mov_E_G(vbi, pfx, sz, delta);
+ break;
+
+ case 0x8D: /* LEA M,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 4 && sz != 8)
+ goto decode_failure;
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm))
+ goto decode_failure;
+ /* NOTE! this is the one place where a segment override prefix
+ has no effect on the address calculation. Therefore we clear
+ any segment override bits in pfx. */
+ addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
+ delta += alen;
+         /* This is a hack.  But it isn't clear that doing the
+ calculation at 32 bits is really worth it. Hence for leal,
+ do the full 64-bit calculation and then truncate it. */
+ putIRegG( sz, pfx, modrm,
+ sz == 4
+ ? unop(Iop_64to32, mkexpr(addr))
+ : mkexpr(addr)
+ );
+ DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
+ nameIRegG(sz,pfx,modrm));
+ break;
+
+//.. case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
+//.. delta = dis_mov_Sw_Ew(sorb, sz, delta);
+//.. break;
+//..
+//.. case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
+//.. delta = dis_mov_Ew_Sw(sorb, delta);
+//.. break;
+
+ case 0xA0: /* MOV Ob,AL */
+ if (have66orF2orF3(pfx)) goto decode_failure;
+ sz = 1;
+ /* Fall through ... */
+ case 0xA1: /* MOV Ov,eAX */
+ if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
+ goto decode_failure;
+ d64 = getDisp64(delta);
+ delta += 8;
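+         /* In 64-bit mode the moffs forms (A0..A3) carry a full
+            64-bit absolute address, hence the 8-byte fetch above. */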
+ ty = szToITy(sz);
+ addr = newTemp(Ity_I64);
+ assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
+ putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
+ DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
+ segRegTxt(pfx), d64,
+ nameIRegRAX(sz));
+ break;
+
+ case 0xA2: /* MOV AL,Ob */
+ if (have66orF2orF3(pfx)) goto decode_failure;
+ sz = 1;
+ /* Fall through ... */
+ case 0xA3: /* MOV eAX,Ov */
+ if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
+ goto decode_failure;
+ d64 = getDisp64(delta);
+ delta += 8;
+ ty = szToITy(sz);
+ addr = newTemp(Ity_I64);
+ assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
+ storeLE( mkexpr(addr), getIRegRAX(sz) );
+ DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
+ segRegTxt(pfx), d64);
+ break;
+
+ /* XXXX be careful here with moves to AH/BH/CH/DH */
+ case 0xB0: /* MOV imm,AL */
+ case 0xB1: /* MOV imm,CL */
+ case 0xB2: /* MOV imm,DL */
+ case 0xB3: /* MOV imm,BL */
+ case 0xB4: /* MOV imm,AH */
+ case 0xB5: /* MOV imm,CH */
+ case 0xB6: /* MOV imm,DH */
+ case 0xB7: /* MOV imm,BH */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ d64 = getUChar(delta);
+ delta += 1;
+ putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
+ DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
+ break;
+
+ case 0xB8: /* MOV imm,eAX */
+ case 0xB9: /* MOV imm,eCX */
+ case 0xBA: /* MOV imm,eDX */
+ case 0xBB: /* MOV imm,eBX */
+ case 0xBC: /* MOV imm,eSP */
+ case 0xBD: /* MOV imm,eBP */
+ case 0xBE: /* MOV imm,eSI */
+ case 0xBF: /* MOV imm,eDI */
+ /* This is the one-and-only place where 64-bit literals are
+ allowed in the instruction stream. */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz == 8) {
+ d64 = getDisp64(delta);
+ delta += 8;
+ putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
+ DIP("movabsq $%lld,%s\n", (Long)d64,
+ nameIRegRexB(8,pfx,opc-0xB8));
+ } else {
+ d64 = getSDisp(imin(4,sz),delta);
+ delta += imin(4,sz);
+ putIRegRexB(sz, pfx, opc-0xB8,
+ mkU(szToITy(sz), d64 & mkSizeMask(sz)));
+ DIP("mov%c $%lld,%s\n", nameISize(sz),
+ (Long)d64,
+ nameIRegRexB(sz,pfx,opc-0xB8));
+ }
+ break;
+
+ case 0xC6: /* MOV Ib,Eb */
+ sz = 1;
+ goto do_Mov_I_E;
+ case 0xC7: /* MOV Iv,Ev */
+ goto do_Mov_I_E;
+
+ do_Mov_I_E:
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ delta++; /* mod/rm byte */
+ d64 = getSDisp(imin(4,sz),delta);
+ delta += imin(4,sz);
+ putIRegE(sz, pfx, modrm,
+ mkU(szToITy(sz), d64 & mkSizeMask(sz)));
+ DIP("mov%c $%lld, %s\n", nameISize(sz),
+ (Long)d64,
+ nameIRegE(sz,pfx,modrm));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
+ /*xtra*/imin(4,sz) );
+ delta += alen;
+ d64 = getSDisp(imin(4,sz),delta);
+ delta += imin(4,sz);
+ storeLE(mkexpr(addr),
+ mkU(szToITy(sz), d64 & mkSizeMask(sz)));
+ DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
+ }
+ break;
+
+ /* ------------------------ MOVx ------------------------ */
+
+ case 0x63: /* MOVSX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (haveREX(pfx) && 1==getRexW(pfx)) {
+ vassert(sz == 8);
+ /* movsx r/m32 to r64 */
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putIRegG(8, pfx, modrm,
+ unop(Iop_32Sto64,
+ getIRegE(4, pfx, modrm)));
+ DIP("movslq %s,%s\n",
+ nameIRegE(4, pfx, modrm),
+ nameIRegG(8, pfx, modrm));
+ break;
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+ putIRegG(8, pfx, modrm,
+ unop(Iop_32Sto64,
+ loadLE(Ity_I32, mkexpr(addr))));
+ DIP("movslq %s,%s\n", dis_buf,
+ nameIRegG(8, pfx, modrm));
+ break;
+ }
+ } else {
+ goto decode_failure;
+ }
+
+ /* ------------------------ opl imm, A ----------------- */
+
+ case 0x04: /* ADD Ib, AL */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
+ break;
+ case 0x05: /* ADD Iv, eAX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
+ break;
+
+ case 0x0C: /* OR Ib, AL */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
+ break;
+ case 0x0D: /* OR Iv, eAX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
+ break;
+
+ case 0x14: /* ADC Ib, AL */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
+ break;
+ case 0x15: /* ADC Iv, eAX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
+ break;
+
+ case 0x1C: /* SBB Ib, AL */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
+ break;
+ case 0x1D: /* SBB Iv, eAX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
+ break;
+
+ case 0x24: /* AND Ib, AL */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
+ break;
+ case 0x25: /* AND Iv, eAX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
+ break;
+
+ case 0x2C: /* SUB Ib, AL */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
+ break;
+ case 0x2D: /* SUB Iv, eAX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
+ break;
+
+ case 0x34: /* XOR Ib, AL */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
+ break;
+ case 0x35: /* XOR Iv, eAX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
+ break;
+
+ case 0x3C: /* CMP Ib, AL */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
+ break;
+ case 0x3D: /* CMP Iv, eAX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
+ break;
+
+ case 0xA8: /* TEST Ib, AL */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
+ break;
+ case 0xA9: /* TEST Iv, eAX */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
+ break;
+
+ /* ------------------------ opl Ev, Gv ----------------- */
+
+ case 0x02: /* ADD Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
+ break;
+ case 0x03: /* ADD Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
+ break;
+
+ case 0x0A: /* OR Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
+ break;
+ case 0x0B: /* OR Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
+ break;
+
+ case 0x12: /* ADC Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
+ break;
+ case 0x13: /* ADC Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
+ break;
+
+ case 0x1A: /* SBB Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
+ break;
+ case 0x1B: /* SBB Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
+ break;
+
+ case 0x22: /* AND Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
+ break;
+ case 0x23: /* AND Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
+ break;
+
+ case 0x2A: /* SUB Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
+ break;
+ case 0x2B: /* SUB Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
+ break;
+
+ case 0x32: /* XOR Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
+ break;
+ case 0x33: /* XOR Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
+ break;
+
+ case 0x3A: /* CMP Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
+ break;
+ case 0x3B: /* CMP Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
+ break;
+
+ case 0x84: /* TEST Eb,Gb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
+ break;
+ case 0x85: /* TEST Ev,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
+ break;
+
+ /* ------------------------ opl Gv, Ev ----------------- */
+
+ case 0x00: /* ADD Gb,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
+ break;
+ case 0x01: /* ADD Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
+ break;
+
+ case 0x08: /* OR Gb,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
+ break;
+ case 0x09: /* OR Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
+ break;
+
+ case 0x10: /* ADC Gb,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
+ break;
+ case 0x11: /* ADC Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
+ break;
+
+ case 0x18: /* SBB Gb,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
+ break;
+ case 0x19: /* SBB Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
+ break;
+
+ case 0x20: /* AND Gb,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
+ break;
+ case 0x21: /* AND Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
+ break;
+
+ case 0x28: /* SUB Gb,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
+ break;
+ case 0x29: /* SUB Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
+ break;
+
+ case 0x30: /* XOR Gb,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
+ break;
+ case 0x31: /* XOR Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
+ break;
+
+ case 0x38: /* CMP Gb,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
+ break;
+ case 0x39: /* CMP Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
+ break;
+
+ /* ------------------------ POP ------------------------ */
+
+ case 0x58: /* POP eAX */
+ case 0x59: /* POP eCX */
+ case 0x5A: /* POP eDX */
+ case 0x5B: /* POP eBX */
+ case 0x5D: /* POP eBP */
+ case 0x5E: /* POP eSI */
+ case 0x5F: /* POP eDI */
+ case 0x5C: /* POP eSP */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ vassert(sz == 2 || sz == 4 || sz == 8);
+ if (sz == 4)
+ sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
+ t1 = newTemp(szToITy(sz));
+ t2 = newTemp(Ity_I64);
+ assign(t2, getIReg64(R_RSP));
+ assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
+ putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
+ putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
+ DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
+ break;
+
+ case 0x9D: /* POPF */
+ /* Note. There is no encoding for a 32-bit popf in 64-bit mode.
+ So sz==4 actually means sz==8. */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ vassert(sz == 2 || sz == 4);
+ if (sz == 4) sz = 8;
+ if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
+ t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
+ assign(t2, getIReg64(R_RSP));
+ assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
+ putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
+ /* t1 is the flag word. Mask out everything except OSZACP and
+ set the flags thunk to AMD64G_CC_OP_COPY. */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ binop(Iop_And64,
+ mkexpr(t1),
+ mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
+ | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
+ | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
+ )
+ )
+ );
+
+ /* Also need to set the D flag, which is held in bit 10 of t1.
+ If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
+ stmt( IRStmt_Put(
+ OFFB_DFLAG,
+ IRExpr_Mux0X(
+ unop(Iop_32to8,
+ unop(Iop_64to32,
+ binop(Iop_And64,
+ binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
+ mkU64(1)))),
+ mkU64(1),
+ mkU64(0xFFFFFFFFFFFFFFFFULL)))
+ );
+
+ /* And set the ID flag */
+ stmt( IRStmt_Put(
+ OFFB_IDFLAG,
+ IRExpr_Mux0X(
+ unop(Iop_32to8,
+ unop(Iop_64to32,
+ binop(Iop_And64,
+ binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
+ mkU64(1)))),
+ mkU64(0),
+ mkU64(1)))
+ );
+
+ /* And set the AC flag too */
+ stmt( IRStmt_Put(
+ OFFB_ACFLAG,
+ IRExpr_Mux0X(
+ unop(Iop_32to8,
+ unop(Iop_64to32,
+ binop(Iop_And64,
+ binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
+ mkU64(1)))),
+ mkU64(0),
+ mkU64(1)))
+ );
+
+ DIP("popf%c\n", nameISize(sz));
+ break;
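+
+      /* Summary of the thunk convention relied on above (assuming
+         the AMD64G_CC_* masks sit at the architectural rflags bit
+         positions): with CC_OP == AMD64G_CC_OP_COPY, the computed
+         rflags are simply
+             CC_DEP1 & (O|S|Z|A|C|P)
+         so storing the masked popped word in CC_DEP1 makes the
+         OSZACP flags take exactly the popped values.  D, ID and AC
+         do not live in the thunk, hence the three explicit Puts. */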
+
+//.. case 0x61: /* POPA */
+//.. /* This is almost certainly wrong for sz==2. So ... */
+//.. if (sz != 4) goto decode_failure;
+//..
+//.. /* t5 is the old %ESP value. */
+//.. t5 = newTemp(Ity_I32);
+//.. assign( t5, getIReg(4, R_ESP) );
+//..
+//.. /* Reload all the registers, except %esp. */
+//.. putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
+//.. putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
+//.. putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
+//.. putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
+//.. /* ignore saved %ESP */
+//.. putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
+//.. putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
+//.. putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
+//..
+//.. /* and move %ESP back up */
+//.. putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
+//..
+//..       DIP("popa%c\n", nameISize(sz));
+//.. break;
+
+ case 0x8F: { /* POPQ m64 / POPW m16 */
+ Int len;
+ UChar rm;
+ /* There is no encoding for 32-bit pop in 64-bit mode.
+ So sz==4 actually means sz==8. */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ vassert(sz == 2 || sz == 4);
+ if (sz == 4) sz = 8;
+ if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
+
+ rm = getUChar(delta);
+
+      /* make sure this instruction really is a POP */
+      if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
+         goto decode_failure;
+      /* and has the correct size */
+      vassert(sz == 8);
+
+ t1 = newTemp(Ity_I64);
+ t3 = newTemp(Ity_I64);
+ assign( t1, getIReg64(R_RSP) );
+ assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
+
+ /* Increase RSP; must be done before the STORE. Intel manual
+ says: If the RSP register is used as a base register for
+ addressing a destination operand in memory, the POP
+ instruction computes the effective address of the operand
+ after it increments the RSP register. */
+ putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
+
+ addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
+ storeLE( mkexpr(addr), mkexpr(t3) );
+
+ DIP("popl %s\n", dis_buf);
+
+ delta += len;
+ break;
+ }
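+
+   /* Concrete example of the quirk above: for "popq (%rsp)", the
+      value is loaded from the old RSP, RSP is bumped by 8, and the
+      store then goes to the address computed from the *new* RSP,
+      viz. old RSP + 8.  Hence disAMode must run only after the RSP
+      update. */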
+
+//.. //-- case 0x1F: /* POP %DS */
+//.. //-- dis_pop_segreg( cb, R_DS, sz ); break;
+//.. //-- case 0x07: /* POP %ES */
+//.. //-- dis_pop_segreg( cb, R_ES, sz ); break;
+//.. //-- case 0x17: /* POP %SS */
+//.. //-- dis_pop_segreg( cb, R_SS, sz ); break;
+
+ /* ------------------------ PUSH ----------------------- */
+
+ case 0x50: /* PUSH eAX */
+ case 0x51: /* PUSH eCX */
+ case 0x52: /* PUSH eDX */
+ case 0x53: /* PUSH eBX */
+ case 0x55: /* PUSH eBP */
+ case 0x56: /* PUSH eSI */
+ case 0x57: /* PUSH eDI */
+ case 0x54: /* PUSH eSP */
+ /* This is the Right Way, in that the value to be pushed is
+ established before %rsp is changed, so that pushq %rsp
+ correctly pushes the old value. */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ vassert(sz == 2 || sz == 4 || sz == 8);
+ if (sz == 4)
+ sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
+ ty = sz==2 ? Ity_I16 : Ity_I64;
+ t1 = newTemp(ty);
+ t2 = newTemp(Ity_I64);
+ assign(t1, getIRegRexB(sz, pfx, opc-0x50));
+ assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
+ putIReg64(R_RSP, mkexpr(t2) );
+ storeLE(mkexpr(t2),mkexpr(t1));
+ DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
+ break;
+
+ case 0x68: /* PUSH Iv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+      /* Note, there is no encoding for a 32-bit push in 64-bit
+         mode, so sz==4 really means sz==8.  Hence ... */
+ if (sz == 4) sz = 8;
+ d64 = getSDisp(imin(4,sz),delta);
+ delta += imin(4,sz);
+ goto do_push_I;
+ case 0x6A: /* PUSH Ib, sign-extended to sz */
+ if (haveF2orF3(pfx)) goto decode_failure;
+      /* Note, there is no encoding for a 32-bit push in 64-bit
+         mode, so sz==4 really means sz==8.  Hence ... */
+ if (sz == 4) sz = 8;
+ d64 = getSDisp8(delta); delta += 1;
+ goto do_push_I;
+ do_push_I:
+ ty = szToITy(sz);
+ t1 = newTemp(Ity_I64);
+ t2 = newTemp(ty);
+ assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
+ putIReg64(R_RSP, mkexpr(t1) );
+      /* stop mkU16 asserting if d64 is a negative 16-bit number
+         (bug #132813) */
+ if (ty == Ity_I16)
+ d64 &= 0xFFFF;
+ storeLE( mkexpr(t1), mkU(ty,d64) );
+ DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
+ break;
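+
+      /* Worked example: "pushq $-1" (6A FF) sign-extends the byte,
+         so RSP drops by 8 and 0xFFFFFFFFFFFFFFFF is stored; with a
+         66 prefix ("pushw $-1") only 0xFFFF is stored, which is why
+         d64 is masked to 16 bits above. */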
+
+ case 0x9C: /* PUSHF */ {
+ /* Note. There is no encoding for a 32-bit pushf in 64-bit
+ mode. So sz==4 actually means sz==8. */
+ /* 24 July 06: has also been seen with a redundant REX prefix,
+ so must also allow sz==8. */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ vassert(sz == 2 || sz == 4 || sz == 8);
+ if (sz == 4) sz = 8;
+ if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
+
+ t1 = newTemp(Ity_I64);
+ assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
+ putIReg64(R_RSP, mkexpr(t1) );
+
+ t2 = newTemp(Ity_I64);
+ assign( t2, mk_amd64g_calculate_rflags_all() );
+
+   /* Patch in the D flag.  Since the guest DFLAG field holds 1 or
+      -1 (all ones), its bit 10 is exactly the architectural D bit. */
+ t3 = newTemp(Ity_I64);
+ assign( t3, binop(Iop_Or64,
+ mkexpr(t2),
+ binop(Iop_And64,
+ IRExpr_Get(OFFB_DFLAG,Ity_I64),
+ mkU64(1<<10)))
+ );
+
+ /* And patch in the ID flag. */
+ t4 = newTemp(Ity_I64);
+ assign( t4, binop(Iop_Or64,
+ mkexpr(t3),
+ binop(Iop_And64,
+ binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
+ mkU8(21)),
+ mkU64(1<<21)))
+ );
+
+ /* And patch in the AC flag too. */
+ t5 = newTemp(Ity_I64);
+ assign( t5, binop(Iop_Or64,
+ mkexpr(t4),
+ binop(Iop_And64,
+ binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
+ mkU8(18)),
+ mkU64(1<<18)))
+ );
+
+ /* if sz==2, the stored value needs to be narrowed. */
+ if (sz == 2)
+ storeLE( mkexpr(t1), unop(Iop_32to16,
+ unop(Iop_64to32,mkexpr(t5))) );
+ else
+ storeLE( mkexpr(t1), mkexpr(t5) );
+
+ DIP("pushf%c\n", nameISize(sz));
+ break;
+ }
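+
+   /* So the stored word is assembled as (a sketch):
+        rflags = OSZACP(thunk)          // from the flags thunk
+               | (DFLAG & (1 << 10))    // DFLAG is 1 or -1 (all ones)
+               | (IDFLAG << 21)         // IDFLAG is 0 or 1
+               | (ACFLAG << 18)         // ACFLAG is 0 or 1
+      matching the architectural D, ID and AC bit positions. */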
+
+//.. case 0x60: /* PUSHA */
+//.. /* This is almost certainly wrong for sz==2. So ... */
+//.. if (sz != 4) goto decode_failure;
+//..
+//.. /* This is the Right Way, in that the value to be pushed is
+//.. established before %esp is changed, so that pusha
+//.. correctly pushes the old %esp value. New value of %esp is
+//.. pushed at start. */
+//.. /* t0 is the %ESP value we're going to push. */
+//.. t0 = newTemp(Ity_I32);
+//.. assign( t0, getIReg(4, R_ESP) );
+//..
+//.. /* t5 will be the new %ESP value. */
+//.. t5 = newTemp(Ity_I32);
+//.. assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
+//..
+//.. /* Update guest state before prodding memory. */
+//.. putIReg(4, R_ESP, mkexpr(t5));
+//..
+//.. /* Dump all the registers. */
+//.. storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
+//.. storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
+//.. storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
+//.. storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
+//.. storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
+//.. storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
+//.. storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
+//.. storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
+//..
+//.. DIP("pusha%c\n", nameISize(sz));
+//.. break;
+//..
+//..
+//.. //-- case 0x0E: /* PUSH %CS */
+//.. //-- dis_push_segreg( cb, R_CS, sz ); break;
+//.. //-- case 0x1E: /* PUSH %DS */
+//.. //-- dis_push_segreg( cb, R_DS, sz ); break;
+//.. //-- case 0x06: /* PUSH %ES */
+//.. //-- dis_push_segreg( cb, R_ES, sz ); break;
+//.. //-- case 0x16: /* PUSH %SS */
+//.. //-- dis_push_segreg( cb, R_SS, sz ); break;
+//..
+//.. /* ------------------------ SCAS et al ----------------- */
+//..
+//.. case 0xA4: /* MOVS, no REP prefix */
+//.. case 0xA5:
+//.. dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
+//.. break;
+//..
+//.. case 0xA6: /* CMPSb, no REP prefix */
+//.. //-- case 0xA7:
+//.. dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
+//.. break;
+//.. //--
+//.. //--
+ case 0xAC: /* LODS, no REP prefix */
+ case 0xAD:
+ dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
+ break;
+//..
+//.. case 0xAE: /* SCAS, no REP prefix */
+//.. case 0xAF:
+//.. dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
+//.. break;
+
+
+ case 0xFC: /* CLD */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
+ DIP("cld\n");
+ break;
+
+ case 0xFD: /* STD */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
+ DIP("std\n");
+ break;
+
+ case 0xF8: /* CLC */
+ case 0xF9: /* STC */
+ case 0xF5: /* CMC */
+ t0 = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I64);
+ assign( t0, mk_amd64g_calculate_rflags_all() );
+ switch (opc) {
+ case 0xF8:
+ assign( t1, binop(Iop_And64, mkexpr(t0),
+ mkU64(~AMD64G_CC_MASK_C)));
+ DIP("clc\n");
+ break;
+ case 0xF9:
+ assign( t1, binop(Iop_Or64, mkexpr(t0),
+ mkU64(AMD64G_CC_MASK_C)));
+ DIP("stc\n");
+ break;
+ case 0xF5:
+ assign( t1, binop(Iop_Xor64, mkexpr(t0),
+ mkU64(AMD64G_CC_MASK_C)));
+ DIP("cmc\n");
+ break;
+ default:
+            vpanic("disInstr(amd64)(clc/stc/cmc)");
+ }
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+ break;
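+
+      /* In effect, starting from the fully-recomputed rflags:
+             clc: new = old & ~C;   stc: new = old | C;
+             cmc: new = old ^ C;    (C == AMD64G_CC_MASK_C)
+         and the result is reinstalled via the COPY thunk. */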
+
+//.. /* REPNE prefix insn */
+//.. case 0xF2: {
+//.. Addr32 eip_orig = guest_eip_bbstart + delta - 1;
+//.. vassert(sorb == 0);
+//.. abyte = getUChar(delta); delta++;
+//..
+//.. if (abyte == 0x66) { sz = 2; abyte = getUChar(delta); delta++; }
+//.. whatNext = Dis_StopHere;
+//..
+//.. switch (abyte) {
+//.. /* According to the Intel manual, "repne movs" should never occur, but
+//.. * in practice it has happened, so allow for it here... */
+//.. case 0xA4: sz = 1; /* REPNE MOVS<sz> */
+//.. goto decode_failure;
+//.. //-- case 0xA5:
+//.. // dis_REP_op ( CondNZ, dis_MOVS, sz, eip_orig,
+//.. // guest_eip_bbstart+delta, "repne movs" );
+//.. // break;
+//.. //--
+//.. //-- case 0xA6: sz = 1; /* REPNE CMPS<sz> */
+//.. //-- case 0xA7:
+//.. //-- dis_REP_op ( cb, CondNZ, dis_CMPS, sz, eip_orig, eip, "repne cmps" );
+//.. //-- break;
+//.. //--
+//.. case 0xAE: sz = 1; /* REPNE SCAS<sz> */
+//.. case 0xAF:
+//.. dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
+//.. guest_eip_bbstart+delta, "repne scas" );
+//.. break;
+//..
+//.. default:
+//.. goto decode_failure;
+//.. }
+//.. break;
+//.. }
+
+ /* ------ AE: SCAS variants ------ */
+ case 0xAE:
+ case 0xAF:
+ /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
+ if (haveASO(pfx))
+ goto decode_failure;
+ if (haveF2(pfx) && !haveF3(pfx)) {
+ if (opc == 0xAE)
+ sz = 1;
+ dis_REP_op ( AMD64CondNZ, dis_SCAS, sz,
+ guest_RIP_curr_instr,
+ guest_RIP_bbstart+delta, "repne scas", pfx );
+ dres.whatNext = Dis_StopHere;
+ break;
+ }
+ /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
+ if (haveASO(pfx))
+ goto decode_failure;
+ if (!haveF2(pfx) && haveF3(pfx)) {
+ if (opc == 0xAE)
+ sz = 1;
+ dis_REP_op ( AMD64CondZ, dis_SCAS, sz,
+ guest_RIP_curr_instr,
+ guest_RIP_bbstart+delta, "repe scas", pfx );
+ dres.whatNext = Dis_StopHere;
+ break;
+ }
+ /* AE/AF: scasb/scas{w,l,q} */
+ if (!haveF2(pfx) && !haveF3(pfx)) {
+ if (opc == 0xAE)
+ sz = 1;
+ dis_string_op( dis_SCAS, sz, "scas", pfx );
+ break;
+ }
+ goto decode_failure;
+
+ /* ------ A6, A7: CMPS variants ------ */
+ case 0xA6:
+ case 0xA7:
+      /* F3 A6/A7: repe cmpsb/repe cmps{w,l,q} */
+ if (haveASO(pfx))
+ goto decode_failure;
+ if (haveF3(pfx) && !haveF2(pfx)) {
+ if (opc == 0xA6)
+ sz = 1;
+ dis_REP_op ( AMD64CondZ, dis_CMPS, sz,
+ guest_RIP_curr_instr,
+ guest_RIP_bbstart+delta, "repe cmps", pfx );
+ dres.whatNext = Dis_StopHere;
+ break;
+ }
+ goto decode_failure;
+
+ /* ------ AA, AB: STOS variants ------ */
+ case 0xAA:
+ case 0xAB:
+ /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
+ if (haveASO(pfx))
+ goto decode_failure;
+ if (haveF3(pfx) && !haveF2(pfx)) {
+ if (opc == 0xAA)
+ sz = 1;
+ dis_REP_op ( AMD64CondAlways, dis_STOS, sz,
+ guest_RIP_curr_instr,
+ guest_RIP_bbstart+delta, "rep stos", pfx );
+ dres.whatNext = Dis_StopHere;
+ break;
+ }
+ /* AA/AB: stosb/stos{w,l,q} */
+ if (!haveF3(pfx) && !haveF2(pfx)) {
+ if (opc == 0xAA)
+ sz = 1;
+ dis_string_op( dis_STOS, sz, "stos", pfx );
+ break;
+ }
+ goto decode_failure;
+
+ /* ------ A4, A5: MOVS variants ------ */
+ case 0xA4:
+ case 0xA5:
+      /* F3 A4/A5: rep movsb/rep movs{w,l,q} */
+ if (haveASO(pfx))
+ goto decode_failure;
+ if (haveF3(pfx) && !haveF2(pfx)) {
+ if (opc == 0xA4)
+ sz = 1;
+ dis_REP_op ( AMD64CondAlways, dis_MOVS, sz,
+ guest_RIP_curr_instr,
+ guest_RIP_bbstart+delta, "rep movs", pfx );
+ dres.whatNext = Dis_StopHere;
+ break;
+ }
+      /* A4/A5: movsb/movs{w,l,q} */
+ if (!haveF3(pfx) && !haveF2(pfx)) {
+ if (opc == 0xA4)
+ sz = 1;
+ dis_string_op( dis_MOVS, sz, "movs", pfx );
+ break;
+ }
+ goto decode_failure;
+
+
+ /* ------------------------ XCHG ----------------------- */
+
+   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
+      prefix.  Therefore, the memory case below is implemented as a
+      compare-and-swap (casLE), and *expect_CAS is set.  But be
+      careful: if it is used with an explicit LOCK prefix, we don't
+      want to generate the atomic machinery twice -- once here and
+      once by the generic LOCK logic at the top of disInstr. */
+ case 0x86: /* XCHG Gb,Eb */
+ sz = 1;
+ /* Fall through ... */
+ case 0x87: /* XCHG Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ ty = szToITy(sz);
+ t1 = newTemp(ty); t2 = newTemp(ty);
+ if (epartIsReg(modrm)) {
+ assign(t1, getIRegE(sz, pfx, modrm));
+ assign(t2, getIRegG(sz, pfx, modrm));
+ putIRegG(sz, pfx, modrm, mkexpr(t1));
+ putIRegE(sz, pfx, modrm, mkexpr(t2));
+ delta++;
+ DIP("xchg%c %s, %s\n",
+ nameISize(sz), nameIRegG(sz, pfx, modrm),
+ nameIRegE(sz, pfx, modrm));
+ } else {
+ *expect_CAS = True;
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ assign( t1, loadLE(ty, mkexpr(addr)) );
+ assign( t2, getIRegG(sz, pfx, modrm) );
+ casLE( mkexpr(addr),
+ mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
+ putIRegG( sz, pfx, modrm, mkexpr(t1) );
+ delta += alen;
+ DIP("xchg%c %s, %s\n", nameISize(sz),
+ nameIRegG(sz, pfx, modrm), dis_buf);
+ }
+ break;
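+
+      /* The memory case above expresses the atomic exchange as an
+         assumed-successful compare-and-swap (a sketch):
+             old = *addr;
+             CAS(addr, expected = old, new = Greg);
+             Greg = old;
+         If the CAS fails because another thread wrote to addr in
+         between, the guest_RIP_curr_instr argument to casLE allows
+         the whole instruction to be restarted. */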
+
+ case 0x90: /* XCHG eAX,eAX */
+ /* detect and handle F3 90 (rep nop) specially */
+ if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
+ DIP("rep nop (P4 pause)\n");
+ /* "observe" the hint. The Vex client needs to be careful not
+ to cause very long delays as a result, though. */
+ jmp_lit(Ijk_Yield, guest_RIP_bbstart+delta);
+ dres.whatNext = Dis_StopHere;
+ break;
+ }
+ /* detect and handle NOPs specially */
+ if (/* F2/F3 probably change meaning completely */
+ !haveF2orF3(pfx)
+ /* If REX.B is 1, we're not exchanging rAX with itself */
+ && getRexB(pfx)==0 ) {
+ DIP("nop\n");
+ break;
+ }
+ /* else fall through to normal case. */
+ case 0x91: /* XCHG rAX,rCX */
+ case 0x92: /* XCHG rAX,rDX */
+ case 0x93: /* XCHG rAX,rBX */
+ case 0x94: /* XCHG rAX,rSP */
+ case 0x95: /* XCHG rAX,rBP */
+ case 0x96: /* XCHG rAX,rSI */
+ case 0x97: /* XCHG rAX,rDI */
+
+ /* guard against mutancy */
+ if (haveF2orF3(pfx)) goto decode_failure;
+
+ /* sz == 2 could legitimately happen, but we don't handle it yet */
+ if (sz == 2) goto decode_failure; /* awaiting test case */
+
+ codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
+ break;
+
+//.. //-- /* ------------------------ XLAT ----------------------- */
+//.. //--
+//.. //-- case 0xD7: /* XLAT */
+//.. //-- t1 = newTemp(cb); t2 = newTemp(cb);
+//.. //-- uInstr2(cb, GET, sz, ArchReg, R_EBX, TempReg, t1); /* get eBX */
+//.. //-- handleAddrOverrides( cb, sorb, t1 ); /* make t1 DS:eBX */
+//.. //-- uInstr2(cb, GET, 1, ArchReg, R_AL, TempReg, t2); /* get AL */
+//.. //-- /* Widen %AL to 32 bits, so it's all defined when we add it. */
+//.. //-- uInstr1(cb, WIDEN, 4, TempReg, t2);
+//.. //-- uWiden(cb, 1, False);
+//.. //-- uInstr2(cb, ADD, sz, TempReg, t2, TempReg, t1); /* add AL to eBX */
+//.. //-- uInstr2(cb, LOAD, 1, TempReg, t1, TempReg, t2); /* get byte at t1 into t2 */
+//.. //-- uInstr2(cb, PUT, 1, TempReg, t2, ArchReg, R_AL); /* put byte into AL */
+//.. //--
+//.. //-- DIP("xlat%c [ebx]\n", nameISize(sz));
+//.. //-- break;
+
+ /* ------------------------ IN / OUT ----------------------- */
+
+ case 0xE4: /* IN imm8, AL */
+ sz = 1;
+ t1 = newTemp(Ity_I64);
+ abyte = getUChar(delta); delta++;
+ assign(t1, mkU64( abyte & 0xFF ));
+ DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
+ goto do_IN;
+ case 0xE5: /* IN imm8, eAX */
+ if (!(sz == 2 || sz == 4)) goto decode_failure;
+ t1 = newTemp(Ity_I64);
+ abyte = getUChar(delta); delta++;
+ assign(t1, mkU64( abyte & 0xFF ));
+ DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
+ goto do_IN;
+ case 0xEC: /* IN %DX, AL */
+ sz = 1;
+ t1 = newTemp(Ity_I64);
+ assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
+ DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
+ nameIRegRAX(sz));
+ goto do_IN;
+ case 0xED: /* IN %DX, eAX */
+ if (!(sz == 2 || sz == 4)) goto decode_failure;
+ t1 = newTemp(Ity_I64);
+ assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
+ DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
+ nameIRegRAX(sz));
+ goto do_IN;
+ do_IN: {
+      /* At this point, sz indicates the width, and t1 is a 64-bit
+         value giving the port number. */
+ IRDirty* d;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ vassert(sz == 1 || sz == 2 || sz == 4);
+ ty = szToITy(sz);
+ t2 = newTemp(Ity_I64);
+ d = unsafeIRDirty_1_N(
+ t2,
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_IN",
+ &amd64g_dirtyhelper_IN,
+ mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
+ );
+ /* do the call, dumping the result in t2. */
+ stmt( IRStmt_Dirty(d) );
+ putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
+ break;
+ }
+
+ case 0xE6: /* OUT AL, imm8 */
+ sz = 1;
+ t1 = newTemp(Ity_I64);
+ abyte = getUChar(delta); delta++;
+ assign( t1, mkU64( abyte & 0xFF ) );
+ DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
+ goto do_OUT;
+ case 0xE7: /* OUT eAX, imm8 */
+ if (!(sz == 2 || sz == 4)) goto decode_failure;
+ t1 = newTemp(Ity_I64);
+ abyte = getUChar(delta); delta++;
+ assign( t1, mkU64( abyte & 0xFF ) );
+ DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
+ goto do_OUT;
+ case 0xEE: /* OUT AL, %DX */
+ sz = 1;
+ t1 = newTemp(Ity_I64);
+ assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
+ DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
+ nameIRegRDX(2));
+ goto do_OUT;
+ case 0xEF: /* OUT eAX, %DX */
+ if (!(sz == 2 || sz == 4)) goto decode_failure;
+ t1 = newTemp(Ity_I64);
+ assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
+ DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
+ nameIRegRDX(2));
+ goto do_OUT;
+ do_OUT: {
+      /* At this point, sz indicates the width, and t1 is a 64-bit
+         value giving the port number. */
+ IRDirty* d;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ vassert(sz == 1 || sz == 2 || sz == 4);
+ ty = szToITy(sz);
+ d = unsafeIRDirty_0_N(
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_OUT",
+ &amd64g_dirtyhelper_OUT,
+ mkIRExprVec_3( mkexpr(t1),
+ widenUto64( getIRegRAX(sz) ),
+ mkU64(sz) )
+ );
+ stmt( IRStmt_Dirty(d) );
+ break;
+ }
+
+ /* ------------------------ (Grp1 extensions) ---------- */
+
+ case 0x80: /* Grp1 Ib,Eb */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ sz = 1;
+ d_sz = 1;
+ d64 = getSDisp8(delta + am_sz);
+ delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
+ break;
+
+ case 0x81: /* Grp1 Iv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ d_sz = imin(sz,4);
+ d64 = getSDisp(d_sz, delta + am_sz);
+ delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
+ break;
+
+ case 0x83: /* Grp1 Ib,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ d_sz = 1;
+ d64 = getSDisp8(delta + am_sz);
+ delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
+ break;
+
+ /* ------------------------ (Grp2 extensions) ---------- */
+
+ case 0xC0: { /* Grp2 Ib,Eb */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ d_sz = 1;
+ d64 = getUChar(delta + am_sz);
+ sz = 1;
+ delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
+ mkU8(d64 & 0xFF), NULL, &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+ case 0xC1: { /* Grp2 Ib,Ev */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ d_sz = 1;
+ d64 = getUChar(delta + am_sz);
+ delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
+ mkU8(d64 & 0xFF), NULL, &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+ case 0xD0: { /* Grp2 1,Eb */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ d_sz = 0;
+ d64 = 1;
+ sz = 1;
+ delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
+ mkU8(d64), NULL, &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+ case 0xD1: { /* Grp2 1,Ev */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ d_sz = 0;
+ d64 = 1;
+ delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
+ mkU8(d64), NULL, &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+ case 0xD2: { /* Grp2 CL,Eb */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ d_sz = 0;
+ sz = 1;
+ delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
+ getIRegCL(), "%cl", &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+ case 0xD3: { /* Grp2 CL,Ev */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ d_sz = 0;
+ delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
+ getIRegCL(), "%cl", &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+
+ /* ------------------------ (Grp3 extensions) ---------- */
+
+ case 0xF6: { /* Grp3 Eb */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+ case 0xF7: { /* Grp3 Ev */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+
+ /* ------------------------ (Grp4 extensions) ---------- */
+
+ case 0xFE: { /* Grp4 Eb */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+
+ /* ------------------------ (Grp5 extensions) ---------- */
+
+ case 0xFF: { /* Grp5 Ev */
+ Bool decode_OK = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_Grp5 ( vbi, pfx, sz, delta, &dres, &decode_OK );
+ if (!decode_OK) goto decode_failure;
+ break;
+ }
+
+ /* ------------------------ Escapes to 2-byte opcodes -- */
+
+ case 0x0F: {
+ opc = getUChar(delta); delta++;
+ switch (opc) {
+
+ /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
+
+ case 0xBA: { /* Grp8 Ib,Ev */
+ Bool decode_OK = False;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(pfx,delta);
+ d64 = getSDisp8(delta + am_sz);
+ delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
+ &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+
+ /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
+
+ case 0xBC: /* BSF Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
+ break;
+ case 0xBD: /* BSR Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
+
+ case 0xC8: /* BSWAP %eax */
+ case 0xC9:
+ case 0xCA:
+ case 0xCB:
+ case 0xCC:
+ case 0xCD:
+ case 0xCE:
+ case 0xCF: /* BSWAP %edi */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ /* According to the AMD64 docs, this insn can have size 4 or
+ 8. */
+ if (sz == 4) {
+ t1 = newTemp(Ity_I32);
+ t2 = newTemp(Ity_I32);
+ assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
+ assign( t2,
+ binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
+ binop(Iop_Or32,
+ binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
+ mkU32(0x00FF0000)),
+ binop(Iop_Or32,
+ binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
+ mkU32(0x0000FF00)),
+ binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
+ mkU32(0x000000FF) )
+ )))
+ );
+ putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
+ DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
+ break;
+ }
+ else if (sz == 8) {
+ IRTemp m8 = newTemp(Ity_I64);
+ IRTemp s8 = newTemp(Ity_I64);
+ IRTemp m16 = newTemp(Ity_I64);
+ IRTemp s16 = newTemp(Ity_I64);
+ IRTemp m32 = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I64);
+ t2 = newTemp(Ity_I64);
+ assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
+
+ assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
+ assign( s8,
+ binop(Iop_Or64,
+ binop(Iop_Shr64,
+ binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
+ mkU8(8)),
+ binop(Iop_And64,
+ binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
+ mkexpr(m8))
+ )
+ );
+
+ assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
+ assign( s16,
+ binop(Iop_Or64,
+ binop(Iop_Shr64,
+ binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
+ mkU8(16)),
+ binop(Iop_And64,
+ binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
+ mkexpr(m16))
+ )
+ );
+
+ assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
+ assign( t2,
+ binop(Iop_Or64,
+ binop(Iop_Shr64,
+ binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
+ mkU8(32)),
+ binop(Iop_And64,
+ binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
+ mkexpr(m32))
+ )
+ );
+
+ putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
+ DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
+ break;
+ } else {
+ goto decode_failure;
+ }
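+
+      /* The 64-bit case above is the standard mask-and-shift byte
+         swap.  A plain C restatement (a sketch, not part of the IR):
+
+            uint64_t bswap64 ( uint64_t x ) {
+               x = ((x & 0xFF00FF00FF00FF00ULL) >> 8)
+                   | ((x << 8)  & 0xFF00FF00FF00FF00ULL);
+               x = ((x & 0xFFFF0000FFFF0000ULL) >> 16)
+                   | ((x << 16) & 0xFFFF0000FFFF0000ULL);
+               x = ((x & 0xFFFFFFFF00000000ULL) >> 32)
+                   | ((x << 32) & 0xFFFFFFFF00000000ULL);
+               return x;
+            }
+
+         i.e. swap adjacent bytes, then adjacent 16-bit units, then
+         the two 32-bit halves. */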
+
+ /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
+
+      /* All of these are possible at sizes 2, 4 and 8, and all
+         three sizes are accepted below. */
+
+ case 0xA3: /* BT Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
+ delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone );
+ break;
+ case 0xB3: /* BTR Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
+ delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset );
+ break;
+ case 0xAB: /* BTS Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
+ delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet );
+ break;
+ case 0xBB: /* BTC Gv,Ev */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
+ delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp );
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
+
+      case 0x40: /* CMOVOb (cmov overflow) */
+      case 0x41: /* CMOVNOb (cmov no overflow) */
+ case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
+ case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
+ case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
+ case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
+ case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
+ case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
+ case 0x48: /* CMOVSb (cmov negative) */
+      case 0x49: /* CMOVNSb (cmov not negative) */
+ case 0x4A: /* CMOVP (cmov parity even) */
+ case 0x4B: /* CMOVNP (cmov parity odd) */
+ case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
+ case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
+ case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
+ case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
+
+ case 0xB0: { /* CMPXCHG Gb,Eb */
+ Bool ok = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
+ if (!ok) goto decode_failure;
+ break;
+ }
+ case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
+ Bool ok = True;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
+ delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
+ if (!ok) goto decode_failure;
+ break;
+ }
+
+ case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
+ IRType elemTy = sz==4 ? Ity_I32 : Ity_I64;
+ IRTemp expdHi = newTemp(elemTy);
+ IRTemp expdLo = newTemp(elemTy);
+ IRTemp dataHi = newTemp(elemTy);
+ IRTemp dataLo = newTemp(elemTy);
+ IRTemp oldHi = newTemp(elemTy);
+ IRTemp oldLo = newTemp(elemTy);
+ IRTemp flags_old = newTemp(Ity_I64);
+ IRTemp flags_new = newTemp(Ity_I64);
+ IRTemp success = newTemp(Ity_I1);
+ IROp opOR = sz==4 ? Iop_Or32 : Iop_Or64;
+ IROp opXOR = sz==4 ? Iop_Xor32 : Iop_Xor64;
+ IROp opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
+ IRExpr* zero = sz==4 ? mkU32(0) : mkU64(0);
+ IRTemp expdHi64 = newTemp(Ity_I64);
+ IRTemp expdLo64 = newTemp(Ity_I64);
+
+ /* Translate this using a DCAS, even if there is no LOCK
+ prefix. Life is too short to bother with generating two
+ different translations for the with/without-LOCK-prefix
+ cases. */
+ *expect_CAS = True;
+
+ /* Decode, and generate address. */
+ if (have66orF2orF3(pfx)) goto decode_failure;
+ if (sz != 4 && sz != 8) goto decode_failure;
+ if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
+ goto decode_failure;
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) goto decode_failure;
+ if (gregLO3ofRM(modrm) != 1) goto decode_failure;
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+
+ /* cmpxchg16b requires an alignment check. */
+ if (sz == 8)
+ gen_SEGV_if_not_16_aligned( addr );
+
+ /* Get the expected and new values. */
+ assign( expdHi64, getIReg64(R_RDX) );
+ assign( expdLo64, getIReg64(R_RAX) );
+
+ /* These are the correctly-sized expected and new values.
+ However, we also get expdHi64/expdLo64 above as 64-bits
+ regardless, because we will need them later in the 32-bit
+ case (paradoxically). */
+ assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
+ : mkexpr(expdHi64) );
+ assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
+ : mkexpr(expdLo64) );
+ assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
+ assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
+
+ /* Do the DCAS */
+ stmt( IRStmt_CAS(
+ mkIRCAS( oldHi, oldLo,
+ Iend_LE, mkexpr(addr),
+ mkexpr(expdHi), mkexpr(expdLo),
+ mkexpr(dataHi), mkexpr(dataLo)
+ )));
+
+ /* success when oldHi:oldLo == expdHi:expdLo */
+ assign( success,
+ binop(opCasCmpEQ,
+ binop(opOR,
+ binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
+ binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
+ ),
+ zero
+ ));
+
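+         /* I.e. success == (((oldHi ^ expdHi) | (oldLo ^ expdLo)) == 0),
+            meaning both halves matched; computed branch-free so it
+            folds into a single CasCmpEQ against zero. */
+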
+ /* If the DCAS is successful, that is to say oldHi:oldLo ==
+ expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
+ which is where they came from originally. Both the actual
+ contents of these two regs, and any shadow values, are
+ unchanged. If the DCAS fails then we're putting into
+ RDX:RAX the value seen in memory. */
+ /* Now of course there's a complication in the 32-bit case
+ (bah!): if the DCAS succeeds, we need to leave RDX:RAX
+ unchanged; but if we use the same scheme as in the 64-bit
+ case, we get hit by the standard rule that a write to the
+ bottom 32 bits of an integer register zeros the upper 32
+ bits. And so the upper halves of RDX and RAX mysteriously
+ become zero. So we have to stuff back in the original
+ 64-bit values which we previously stashed in
+ expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
+ /* It's just _so_ much fun ... */
+ putIRegRDX( 8,
+ IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
+ sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
+ : mkexpr(oldHi),
+ mkexpr(expdHi64)
+ ));
+ putIRegRAX( 8,
+ IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
+ sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
+ : mkexpr(oldLo),
+ mkexpr(expdLo64)
+ ));
+
+ /* Copy the success bit into the Z flag and leave the others
+ unchanged */
+ assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
+ assign(
+ flags_new,
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(flags_old),
+ mkU64(~AMD64G_CC_MASK_Z)),
+ binop(Iop_Shl64,
+ binop(Iop_And64,
+ unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
+ mkU8(AMD64G_CC_SHIFT_Z)) ));
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
+ /* Set NDEP even though it isn't used. This makes
+ redundant-PUT elimination of previous stores to this field
+ work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
+
+ /* Sheesh. Aren't you glad it was me and not you that had to
+ write and validate all this grunge? */
+
+ DIP("cmpxchg8b %s\n", dis_buf);
+ break;
+
+ }
+
+ /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
+
+ case 0xA2: { /* CPUID */
+ /* Uses dirty helper:
+ void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
+ declared to mod rax, wr rbx, rcx, rdx
+ */
+ IRDirty* d = NULL;
+ HChar* fName = NULL;
+ void* fAddr = NULL;
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (archinfo->hwcaps == (VEX_HWCAPS_AMD64_SSE3
+ |VEX_HWCAPS_AMD64_CX16)) {
+         /* Core-2-like machine:
+            fName = "amd64g_dirtyhelper_CPUID_sse3_and_cx16";
+            fAddr = &amd64g_dirtyhelper_CPUID_sse3_and_cx16; */
+         /* Core-i5-like machine */
+         fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
+         fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
+ }
+ else {
+ /* Give a CPUID for at least a baseline machine, SSE2
+ only, and no CX16 */
+ fName = "amd64g_dirtyhelper_CPUID_baseline";
+ fAddr = &amd64g_dirtyhelper_CPUID_baseline;
+ }
+
+ vassert(fName); vassert(fAddr);
+ d = unsafeIRDirty_0_N ( 0/*regparms*/,
+ fName, fAddr, mkIRExprVec_0() );
+ /* declare guest state effects */
+ d->needsBBP = True;
+ d->nFxState = 4;
+ d->fxState[0].fx = Ifx_Modify;
+ d->fxState[0].offset = OFFB_RAX;
+ d->fxState[0].size = 8;
+ d->fxState[1].fx = Ifx_Write;
+ d->fxState[1].offset = OFFB_RBX;
+ d->fxState[1].size = 8;
+ d->fxState[2].fx = Ifx_Modify;
+ d->fxState[2].offset = OFFB_RCX;
+ d->fxState[2].size = 8;
+ d->fxState[3].fx = Ifx_Write;
+ d->fxState[3].offset = OFFB_RDX;
+ d->fxState[3].size = 8;
+ /* execute the dirty call, side-effecting guest state */
+ stmt( IRStmt_Dirty(d) );
+ /* CPUID is a serialising insn. So, just in case someone is
+ using it as a memory fence ... */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("cpuid\n");
+ break;
+ }
+
+ /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
+
+ case 0xB6: /* MOVZXb Eb,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 2 && sz != 4 && sz != 8)
+ goto decode_failure;
+ delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
+ break;
+ case 0xB7: /* MOVZXw Ew,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 4 && sz != 8)
+ goto decode_failure;
+ delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
+ break;
+
+ case 0xBE: /* MOVSXb Eb,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 2 && sz != 4 && sz != 8)
+ goto decode_failure;
+ delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
+ break;
+ case 0xBF: /* MOVSXw Ew,Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ if (sz != 4 && sz != 8)
+ goto decode_failure;
+ delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
+ break;
+
+//.. //-- /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
+//.. //--
+//.. //-- case 0xC3: /* MOVNTI Gv,Ev */
+//.. //-- vg_assert(sz == 4);
+//.. //-- modrm = getUChar(eip);
+//.. //-- vg_assert(!epartIsReg(modrm));
+//.. //-- t1 = newTemp(cb);
+//.. //-- uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
+//.. //-- pair = disAMode ( cb, sorb, eip, dis_buf );
+//.. //-- t2 = LOW24(pair);
+//.. //-- eip += HI8(pair);
+//.. //-- uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
+//.. //-- DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
+//.. //-- break;
+
+ /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
+
+ case 0xAF: /* IMUL Ev, Gv */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ delta = dis_mul_E_G ( vbi, pfx, sz, delta );
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
+
+ case 0x1F:
+ if (haveF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) goto decode_failure;
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+ DIP("nop%c %s\n", nameISize(sz), dis_buf);
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
+      case 0x80: /* JOb (jump overflow) */
+      case 0x81: /* JNOb (jump no overflow) */
+ case 0x82: /* JBb/JNAEb (jump below) */
+ case 0x83: /* JNBb/JAEb (jump not below) */
+ case 0x84: /* JZb/JEb (jump zero) */
+ case 0x85: /* JNZb/JNEb (jump not zero) */
+ case 0x86: /* JBEb/JNAb (jump below or equal) */
+ case 0x87: /* JNBEb/JAb (jump not below or equal) */
+ case 0x88: /* JSb (jump negative) */
+      case 0x89: /* JNSb (jump not negative) */
+ case 0x8A: /* JP (jump parity even) */
+ case 0x8B: /* JNP/JPO (jump parity odd) */
+ case 0x8C: /* JLb/JNGEb (jump less) */
+ case 0x8D: /* JGEb/JNLb (jump greater or equal) */
+ case 0x8E: /* JLEb/JNGb (jump less or equal) */
+ case 0x8F: /* JGb/JNLEb (jump greater) */
+ { Long jmpDelta;
+ HChar* comment = "";
+ if (haveF2orF3(pfx)) goto decode_failure;
+ jmpDelta = getSDisp32(delta);
+ d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
+ delta += 4;
+ if (resteerCisOk
+ && vex_control.guest_chase_cond
+ && (Addr64)d64 != (Addr64)guest_RIP_bbstart
+ && jmpDelta < 0
+ && resteerOkFn( callback_opaque, d64) ) {
+ /* Speculation: assume this backward branch is taken. So
+ we need to emit a side-exit to the insn following this
+ one, on the negation of the condition, and continue at
+ the branch target address (d64). If we wind up back at
+ the first instruction of the trace, just stop; it's
+ better to let the IR loop unroller handle that case. */
+ stmt( IRStmt_Exit(
+ mk_amd64g_calculate_condition(
+ (AMD64Condcode)(1 ^ (opc - 0x80))),
+ Ijk_Boring,
+ IRConst_U64(guest_RIP_bbstart+delta) ) );
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = d64;
+ comment = "(assumed taken)";
+ }
+ else
+ if (resteerCisOk
+ && vex_control.guest_chase_cond
+ && (Addr64)d64 != (Addr64)guest_RIP_bbstart
+ && jmpDelta >= 0
+ && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
+ /* Speculation: assume this forward branch is not taken.
+ So we need to emit a side-exit to d64 (the dest) and
+ continue disassembling at the insn immediately
+ following this one. */
+ stmt( IRStmt_Exit(
+ mk_amd64g_calculate_condition((AMD64Condcode)
+ (opc - 0x80)),
+ Ijk_Boring,
+ IRConst_U64(d64) ) );
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = guest_RIP_bbstart+delta;
+ comment = "(assumed not taken)";
+ }
+ else {
+ /* Conservative default translation - end the block at
+ this point. */
+ jcc_01( (AMD64Condcode)(opc - 0x80),
+ guest_RIP_bbstart+delta,
+ d64 );
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
+ break;
+ }
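+
+      /* Summary of the three translations above (a sketch):
+           backward Jcc, assumed taken:
+             if (!cond) goto next_insn;    // side exit
+             ... continue tracing at d64 ...
+           forward Jcc, assumed not taken:
+             if (cond) goto d64;           // side exit
+             ... continue tracing at next_insn ...
+           otherwise: end the block with a conditional two-way
+           exit to d64 or next_insn. */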
+
+ /* =-=-=-=-=-=-=-=-=- PREFETCH =-=-=-=-=-=-=-=-=-= */
+ case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
+ /* 0F 0D /1 -- prefetchw mem8 */
+ if (have66orF2orF3(pfx)) goto decode_failure;
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) goto decode_failure;
+ if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
+ goto decode_failure;
+
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+
+ switch (gregLO3ofRM(modrm)) {
+ case 0: DIP("prefetch %s\n", dis_buf); break;
+ case 1: DIP("prefetchw %s\n", dis_buf); break;
+ default: vassert(0); /*NOTREACHED*/
+ }
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
+ case 0x31: { /* RDTSC */
+ IRTemp val = newTemp(Ity_I64);
+ IRExpr** args = mkIRExprVec_0();
+ IRDirty* d = unsafeIRDirty_1_N (
+ val,
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_RDTSC",
+ &amd64g_dirtyhelper_RDTSC,
+ args
+ );
+ if (have66orF2orF3(pfx)) goto decode_failure;
+ /* execute the dirty call, dumping the result in val. */
+ stmt( IRStmt_Dirty(d) );
+ putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
+ putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
+ DIP("rdtsc\n");
+ break;
+ }
+
+//.. /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
+//..
+//.. case 0xA1: /* POP %FS */
+//.. dis_pop_segreg( R_FS, sz ); break;
+//.. case 0xA9: /* POP %GS */
+//.. dis_pop_segreg( R_GS, sz ); break;
+//..
+//.. case 0xA0: /* PUSH %FS */
+//.. dis_push_segreg( R_FS, sz ); break;
+//.. case 0xA8: /* PUSH %GS */
+//.. dis_push_segreg( R_GS, sz ); break;
+
+ /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
+      case 0x90: /* set-Ob (set if overflow) */
+      case 0x91: /* set-NOb (set if no overflow) */
+ case 0x92: /* set-Bb/set-NAEb (set if below) */
+ case 0x93: /* set-NBb/set-AEb (set if not below) */
+ case 0x94: /* set-Zb/set-Eb (set if zero) */
+ case 0x95: /* set-NZb/set-NEb (set if not zero) */
+ case 0x96: /* set-BEb/set-NAb (set if below or equal) */
+ case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
+ case 0x98: /* set-Sb (set if negative) */
+      case 0x99: /* set-NSb (set if not negative) */
+ case 0x9A: /* set-P (set if parity even) */
+ case 0x9B: /* set-NP (set if parity odd) */
+ case 0x9C: /* set-Lb/set-NGEb (set if less) */
+ case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
+ case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
+ case 0x9F: /* set-Gb/set-NLEb (set if greater) */
+ if (haveF2orF3(pfx)) goto decode_failure;
+ t1 = newTemp(Ity_I8);
+ assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putIRegE(1, pfx, modrm, mkexpr(t1));
+ DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
+ nameIRegE(1,pfx,modrm));
+ } else {
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+ storeLE( mkexpr(addr), mkexpr(t1) );
+ DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
+ }
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
+
+ case 0xA4: /* SHLDv imm8,Gv,Ev */
+ modrm = getUChar(delta);
+ d64 = delta + lengthAMode(pfx, delta);
+ vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
+ delta = dis_SHLRD_Gv_Ev (
+ vbi, pfx, delta, modrm, sz,
+ mkU8(getUChar(d64)), True, /* literal */
+ dis_buf, True /* left */ );
+ break;
+ case 0xA5: /* SHLDv %cl,Gv,Ev */
+ modrm = getUChar(delta);
+ delta = dis_SHLRD_Gv_Ev (
+ vbi, pfx, delta, modrm, sz,
+ getIRegCL(), False, /* not literal */
+ "%cl", True /* left */ );
+ break;
+
+ case 0xAC: /* SHRDv imm8,Gv,Ev */
+ modrm = getUChar(delta);
+ d64 = delta + lengthAMode(pfx, delta);
+ vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
+ delta = dis_SHLRD_Gv_Ev (
+ vbi, pfx, delta, modrm, sz,
+ mkU8(getUChar(d64)), True, /* literal */
+ dis_buf, False /* right */ );
+ break;
+ case 0xAD: /* SHRDv %cl,Gv,Ev */
+ modrm = getUChar(delta);
+ delta = dis_SHLRD_Gv_Ev (
+ vbi, pfx, delta, modrm, sz,
+ getIRegCL(), False, /* not literal */
+ "%cl", False /* right */);
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- SYSCALL -=-=-=-=-=-=-=-=-=-= */
+ case 0x05: /* SYSCALL */
+ guest_RIP_next_mustcheck = True;
+ guest_RIP_next_assumed = guest_RIP_bbstart + delta;
+ putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
+ /* It's important that all guest state is up-to-date
+ at this point. So we declare an end-of-block here, which
+ forces any cached guest state to be flushed. */
+ jmp_lit(Ijk_Sys_syscall, guest_RIP_next_assumed);
+ dres.whatNext = Dis_StopHere;
+ DIP("syscall\n");
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
+
+ case 0xC0: { /* XADD Gb,Eb */
+ Bool decode_OK = False;
+ delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+ case 0xC1: { /* XADD Gv,Ev */
+ Bool decode_OK = False;
+ delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+
+ /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
+
+ case 0x71:
+ case 0x72:
+ case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
+
+ case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
+ case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
+ case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
+ case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xFC:
+ case 0xFD:
+ case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xEC:
+ case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xDC:
+ case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xF8:
+ case 0xF9:
+ case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xE8:
+ case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xD8:
+ case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x74:
+ case 0x75:
+ case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x64:
+ case 0x65:
+ case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x68:
+ case 0x69:
+ case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x60:
+ case 0x61:
+ case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xF2:
+ case 0xF3:
+
+ case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xD2:
+ case 0xD3:
+
+ case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xE2:
+ {
+ Long delta0 = delta-1;
+ Bool decode_OK = False;
+
+         /* If sz==2 this is SSE, and we assume the SSE decoder has
+            already spotted those cases by now. */
+ if (sz != 4 && sz != 8)
+ goto decode_failure;
+ if (have66orF2orF3(pfx))
+ goto decode_failure;
+
+ delta = dis_MMX ( &decode_OK, vbi, pfx, sz, delta-1 );
+ if (!decode_OK) {
+ delta = delta0;
+ goto decode_failure;
+ }
+ break;
+ }
+
+ case 0x0E: /* FEMMS */
+ case 0x77: /* EMMS */
+ if (sz != 4)
+ goto decode_failure;
+ do_EMMS_preamble();
+ DIP("{f}emms\n");
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
+ case 0x01: /* 0F 01 /0 -- SGDT */
+ /* 0F 01 /1 -- SIDT */
+ {
+ /* This is really revolting, but ... since each processor
+ (core) only has one IDT and one GDT, just let the guest
+ see it (pass-through semantics). I can't see any way to
+ construct a faked-up value, so don't bother to try. */
+ modrm = getUChar(delta);
+ addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
+ delta += alen;
+ if (epartIsReg(modrm)) goto decode_failure;
+ if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
+ goto decode_failure;
+ switch (gregLO3ofRM(modrm)) {
+ case 0: DIP("sgdt %s\n", dis_buf); break;
+ case 1: DIP("sidt %s\n", dis_buf); break;
+ default: vassert(0); /*NOTREACHED*/
+ }
+
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "amd64g_dirtyhelper_SxDT",
+ &amd64g_dirtyhelper_SxDT,
+ mkIRExprVec_2( mkexpr(addr),
+ mkU64(gregLO3ofRM(modrm)) )
+ );
+ /* declare we're writing memory */
+ d->mFx = Ifx_Write;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 6;
+ stmt( IRStmt_Dirty(d) );
+ break;
+ }
+
+ /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
+
+ default:
+ goto decode_failure;
+ } /* switch (opc) for the 2-byte opcodes */
+ goto decode_success;
+ } /* case 0x0F: of primary opcode */
+
+ /* ------------------------ ??? ------------------------ */
+
+ default:
+ decode_failure:
+ /* All decode failures end up here. */
+ vex_printf("vex amd64->IR: unhandled instruction bytes: "
+ "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
+ (Int)getUChar(delta_start+0),
+ (Int)getUChar(delta_start+1),
+ (Int)getUChar(delta_start+2),
+ (Int)getUChar(delta_start+3),
+ (Int)getUChar(delta_start+4),
+ (Int)getUChar(delta_start+5) );
+
+   /* Tell the dispatcher that this insn cannot be decoded, and so
+      has not been executed, and (is currently) the next to be
+      executed.  RIP should be up-to-date since it was made so at the
+      start of each insn, but be paranoid and update it again right
+      now. */
+ stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
+ jmp_lit(Ijk_NoDecode, guest_RIP_curr_instr);
+ dres.whatNext = Dis_StopHere;
+ dres.len = 0;
+   /* We also need to say that a CAS is not expected now, regardless
+      of what it might have been set to at the start of the function,
+      since the IR that we've emitted just above (to synthesise a
+      SIGILL) does not involve any CAS, and presumably no other IR has
+      been emitted for this (non-decoded) insn. */
+ *expect_CAS = False;
+ return dres;
+
+ } /* switch (opc) for the main (primary) opcode switch. */
+
+ decode_success:
+ /* All decode successes end up here. */
+ DIP("\n");
+ dres.len = (Int)toUInt(delta - delta_start);
+ return dres;
+}
+
+#undef DIP
+#undef DIS
+
+
+/*------------------------------------------------------------*/
+/*--- Top-level fn ---*/
+/*------------------------------------------------------------*/
+
+/* Disassemble a single instruction into IR. The instruction
+ is located in host memory at &guest_code[delta]. */
+
+DisResult disInstr_AMD64 ( IRSB* irsb_IN,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_code_IN,
+ Long delta,
+ Addr64 guest_IP,
+ VexArch guest_arch,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo,
+ Bool host_bigendian_IN )
+{
+ Int i, x1, x2;
+ Bool expect_CAS, has_CAS;
+ DisResult dres;
+
+ /* Set globals (see top of this file) */
+ vassert(guest_arch == VexArchAMD64);
+ guest_code = guest_code_IN;
+ irsb = irsb_IN;
+ host_is_bigendian = host_bigendian_IN;
+ guest_RIP_curr_instr = guest_IP;
+ guest_RIP_bbstart = guest_IP - delta;
+
+ /* We'll consult these after doing disInstr_AMD64_WRK. */
+ guest_RIP_next_assumed = 0;
+ guest_RIP_next_mustcheck = False;
+
+ x1 = irsb_IN->stmts_used;
+ expect_CAS = False;
+ dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
+ resteerCisOk,
+ callback_opaque,
+ delta, archinfo, abiinfo );
+ x2 = irsb_IN->stmts_used;
+ vassert(x2 >= x1);
+
+ /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
+ got it right. Failure of this assertion is serious and denotes
+ a bug in disInstr. */
+ if (guest_RIP_next_mustcheck
+ && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
+ vex_printf("\n");
+ vex_printf("assumed next %%rip = 0x%llx\n",
+ guest_RIP_next_assumed );
+ vex_printf(" actual next %%rip = 0x%llx\n",
+ guest_RIP_curr_instr + dres.len );
+ vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
+ }
+
+ /* See comment at the top of disInstr_AMD64_WRK for meaning of
+ expect_CAS. Here, we (sanity-)check for the presence/absence of
+ IRCAS as directed by the returned expect_CAS value. */
+ has_CAS = False;
+ for (i = x1; i < x2; i++) {
+ if (irsb_IN->stmts[i]->tag == Ist_CAS)
+ has_CAS = True;
+ }
+
+ if (expect_CAS != has_CAS) {
+ /* inconsistency detected. re-disassemble the instruction so as
+ to generate a useful error message; then assert. */
+ vex_traceflags |= VEX_TRACE_FE;
+ dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
+ resteerCisOk,
+ callback_opaque,
+ delta, archinfo, abiinfo );
+ for (i = x1; i < x2; i++) {
+ vex_printf("\t\t");
+ ppIRStmt(irsb_IN->stmts[i]);
+ vex_printf("\n");
+ }
+ /* Failure of this assertion is serious and denotes a bug in
+ disInstr. */
+ vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
+ }
+
+ return dres;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Unused stuff ---*/
+/*------------------------------------------------------------*/
+
+// A potentially more Memcheck-friendly version of gen_LZCNT, if
+// this should ever be needed.
+//
+//static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
+//{
+// /* Scheme is simple: propagate the most significant 1-bit into all
+// lower positions in the word. This gives a word of the form
+// 0---01---1. Now invert it, giving a word of the form
+// 1---10---0, then do a population-count idiom (to count the 1s,
+// which is the number of leading zeroes, or the word size if the
+// original word was 0.
+// */
+// Int i;
+// IRTemp t[7];
+// for (i = 0; i < 7; i++) {
+// t[i] = newTemp(ty);
+// }
+// if (ty == Ity_I64) {
+// assign(t[0], binop(Iop_Or64, mkexpr(src),
+// binop(Iop_Shr64, mkexpr(src), mkU8(1))));
+// assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
+// binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
+// assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
+// binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
+// assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
+// binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
+// assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
+// binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
+// assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
+// binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
+// assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
+// return gen_POPCOUNT(ty, t[6]);
+// }
+// if (ty == Ity_I32) {
+// assign(t[0], binop(Iop_Or32, mkexpr(src),
+// binop(Iop_Shr32, mkexpr(src), mkU8(1))));
+// assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
+// binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
+// assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
+// binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
+// assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
+// binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
+// assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
+// binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
+// assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
+// return gen_POPCOUNT(ty, t[5]);
+// }
+// if (ty == Ity_I16) {
+// assign(t[0], binop(Iop_Or16, mkexpr(src),
+// binop(Iop_Shr16, mkexpr(src), mkU8(1))));
+// assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
+// binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
+// assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
+// binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
+// assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
+// binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
+// assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
+// return gen_POPCOUNT(ty, t[4]);
+// }
+// vassert(0);
+//}
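+
+// Worked example of the idiom (using an 8-bit word for brevity):
+//      src        = 0b00010100
+//      propagated = 0b00011111   (msb smeared rightwards)
+//      inverted   = 0b11100000
+//      popcount   = 3 == number of leading zeroes of src.
+// src == 0 propagates to 0, inverts to all-ones, and popcount then
+// returns the word size, as required.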
+
+
+/*--------------------------------------------------------------------*/
+/*--- end guest_amd64_toIR.c ---*/
+/*--------------------------------------------------------------------*/
diff --git a/VEX/priv/guest_arm_defs.h b/VEX/priv/guest_arm_defs.h
new file mode 100644
index 0000000..02078c4
--- /dev/null
+++ b/VEX/priv/guest_arm_defs.h
@@ -0,0 +1,238 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_arm_defs.h ---*/
+/*---------------------------------------------------------------*/
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Only to be used within the guest-arm directory. */
+
+#ifndef __VEX_GUEST_ARM_DEFS_H
+#define __VEX_GUEST_ARM_DEFS_H
+
+
+/*---------------------------------------------------------*/
+/*--- arm to IR conversion ---*/
+/*---------------------------------------------------------*/
+
+/* Convert one ARM insn to IR. See the type DisOneInstrFn in
+ bb_to_IR.h. */
+extern
+DisResult disInstr_ARM ( IRSB* irbb,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_code,
+ Long delta,
+ Addr64 guest_IP,
+ VexArch guest_arch,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo,
+ Bool host_bigendian );
+
+/* Used by the optimiser to specialise calls to helpers. */
+extern
+IRExpr* guest_arm_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts );
+
+/* Describes to the optimiser which parts of the guest state require
+ precise memory exceptions. This is logically part of the guest
+ state description. */
+extern
+Bool guest_arm_state_requires_precise_mem_exns ( Int, Int );
+
+extern
+VexGuestLayout armGuest_layout;
+
+
+/*---------------------------------------------------------*/
+/*--- arm guest helpers ---*/
+/*---------------------------------------------------------*/
+
+/* --- CLEAN HELPERS --- */
+
+/* Calculate NZCV from the supplied thunk components, in the positions
+   they appear in the CPSR, viz bits 31:28 for N Z C V respectively.
+ Returned bits 27:0 are zero. */
+extern
+UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
+ UInt cc_dep2, UInt cc_dep3 );
+
+/* Calculate the C flag from the thunk components, in the lowest bit
+ of the word (bit 0). */
+extern
+UInt armg_calculate_flag_c ( UInt cc_op, UInt cc_dep1,
+ UInt cc_dep2, UInt cc_dep3 );
+
+/* Calculate the V flag from the thunk components, in the lowest bit
+ of the word (bit 0). */
+extern
+UInt armg_calculate_flag_v ( UInt cc_op, UInt cc_dep1,
+ UInt cc_dep2, UInt cc_dep3 );
+
+/* Calculate the specified condition from the thunk components, in the
+ lowest bit of the word (bit 0). */
+extern
+UInt armg_calculate_condition ( UInt cond_n_op /* ARMCondcode << 4 | cc_op */,
+ UInt cc_dep1,
+ UInt cc_dep2, UInt cc_dep3 );
+
+/* Calculate the QC flag from the thunk components, in the lowest bit
+ of the word (bit 0). */
+extern
+UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
+ UInt resR1, UInt resR2 );
+
+
+/*---------------------------------------------------------*/
+/*--- Condition code stuff ---*/
+/*---------------------------------------------------------*/
+
+/* Flags masks. Defines positions of flags bits in the CPSR. */
+#define ARMG_CC_SHIFT_N 31
+#define ARMG_CC_SHIFT_Z 30
+#define ARMG_CC_SHIFT_C 29
+#define ARMG_CC_SHIFT_V 28
+#define ARMG_CC_SHIFT_Q 27
+
+#define ARMG_CC_MASK_N (1 << ARMG_CC_SHIFT_N)
+#define ARMG_CC_MASK_Z (1 << ARMG_CC_SHIFT_Z)
+#define ARMG_CC_MASK_C (1 << ARMG_CC_SHIFT_C)
+#define ARMG_CC_MASK_V (1 << ARMG_CC_SHIFT_V)
+#define ARMG_CC_MASK_Q (1 << ARMG_CC_SHIFT_Q)
+
+/* Flag thunk descriptors. A four-word thunk is used to record
+ details of the most recent flag-setting operation, so NZCV can
+ be computed later if needed.
+
+ The four words are:
+
+ CC_OP, which describes the operation.
+
+ CC_DEP1, CC_DEP2, CC_DEP3. These are arguments to the
+      operation. We want to set up the mcx_masks in flag helper calls
+ involving these fields so that Memcheck "believes" that the
+ resulting flags are data-dependent on both CC_DEP1 and
+ CC_DEP2. Hence the name DEP.
+
+ When building the thunk, it is always necessary to write words into
+ CC_DEP1/2/3, even if those args are not used given the
+ CC_OP field. This is important because otherwise Memcheck could
+ give false positives as it does not understand the relationship
+ between the CC_OP field and CC_DEP1/2/3, and so believes
+ that the definedness of the stored flags always depends on
+ all 3 DEP values.
+
+ A summary of the field usages is:
+
+      OP         DEP1           DEP2          DEP3
+      ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+      OP_COPY    current NZCV   unused        unused
+      OP_ADD     argL           argR          unused
+      OP_SUB     argL           argR          unused
+      OP_ADC     argL           argR          old_C
+      OP_SBB     argL           argR          old_C
+      OP_LOGIC   result         shifter_co    old_V
+      OP_MUL     result         unused        old_C:old_V
+      OP_MULL    resLO32        resHI32       old_C:old_V
+*/
+
+enum {
+ ARMG_CC_OP_COPY=0, /* DEP1 = NZCV in 31:28, DEP2 = 0, DEP3 = 0
+ just copy DEP1 to output */
+
+ ARMG_CC_OP_ADD, /* DEP1 = argL (Rn), DEP2 = argR (shifter_op),
+ DEP3 = 0 */
+
+ ARMG_CC_OP_SUB, /* DEP1 = argL (Rn), DEP2 = argR (shifter_op),
+ DEP3 = 0 */
+
+ ARMG_CC_OP_ADC, /* DEP1 = argL (Rn), DEP2 = arg2 (shifter_op),
+ DEP3 = oldC (in LSB) */
+
+ ARMG_CC_OP_SBB, /* DEP1 = argL (Rn), DEP2 = arg2 (shifter_op),
+ DEP3 = oldC (in LSB) */
+
+ ARMG_CC_OP_LOGIC, /* DEP1 = result, DEP2 = shifter_carry_out (in LSB),
+ DEP3 = old V flag (in LSB) */
+
+ ARMG_CC_OP_MUL, /* DEP1 = result, DEP2 = 0, DEP3 = oldC:old_V
+ (in bits 1:0) */
+
+ ARMG_CC_OP_MULL, /* DEP1 = resLO32, DEP2 = resHI32, DEP3 = oldC:old_V
+ (in bits 1:0) */
+
+ ARMG_CC_OP_NUMBER
+};
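+
+/* Worked example (illustrative only): after "SUBS Rd, Rn, Rm" the
+   thunk is built as CC_OP = ARMG_CC_OP_SUB, CC_DEP1 = the Rn value,
+   CC_DEP2 = the shifter-operand value (here, Rm), CC_DEP3 = 0.  A
+   subsequent EQ test then amounts to evaluating
+
+      armg_calculate_condition( (ARMCondEQ << 4) | ARMG_CC_OP_SUB,
+                                rn_val, rm_val, 0 )
+
+   which the spechelper in guest_arm_helpers.c reduces to the simple
+   test rn_val == rm_val.  (rn_val and rm_val are placeholder names
+   for the recorded operand values.) */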
+
+/* XXXX because of the calling conventions for
+   armg_calculate_condition, all these OP values MUST be in the range
+   0 .. 15 only (viz, 4 bits). */
+
+
+
+/* Defines conditions which we can ask for (ARM ARM 2e page A3-6) */
+
+typedef
+ enum {
+ ARMCondEQ = 0, /* equal : Z=1 */
+ ARMCondNE = 1, /* not equal : Z=0 */
+
+ ARMCondHS = 2, /* >=u (higher or same) : C=1 */
+ ARMCondLO = 3, /* <u (lower) : C=0 */
+
+ ARMCondMI = 4, /* minus (negative) : N=1 */
+ ARMCondPL = 5, /* plus (zero or +ve) : N=0 */
+
+ ARMCondVS = 6, /* overflow : V=1 */
+ ARMCondVC = 7, /* no overflow : V=0 */
+
+ ARMCondHI = 8, /* >u (higher) : C=1 && Z=0 */
+ ARMCondLS = 9, /* <=u (lower or same) : C=0 || Z=1 */
+
+ ARMCondGE = 10, /* >=s (signed greater or equal) : N=V */
+ ARMCondLT = 11, /* <s (signed less than) : N!=V */
+
+ ARMCondGT = 12, /* >s (signed greater) : Z=0 && N=V */
+ ARMCondLE = 13, /* <=s (signed less or equal) : Z=1 || N!=V */
+
+ ARMCondAL = 14, /* always (unconditional) : 1 */
+    ARMCondNV = 15 /* never (unconditional) : 0 */
+ /* NB: ARM have deprecated the use of the NV condition code.
+ You are now supposed to use MOV R0,R0 as a noop rather than
+ MOVNV R0,R0 as was previously recommended. Future processors
+ may have the NV condition code reused to do other things. */
+ }
+ ARMCondcode;
+
+#endif /* ndef __VEX_GUEST_ARM_DEFS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end guest_arm_defs.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_arm_helpers.c b/VEX/priv/guest_arm_helpers.c
new file mode 100644
index 0000000..f6689a0
--- /dev/null
+++ b/VEX/priv/guest_arm_helpers.c
@@ -0,0 +1,701 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_arm_helpers.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_emwarn.h"
+#include "libvex_guest_arm.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+#include "main_util.h"
+#include "guest_generic_bb_to_IR.h"
+#include "guest_arm_defs.h"
+
+
+/* This file contains helper functions for arm guest code. Calls to
+ these functions are generated by the back end. These calls are of
+ course in the host machine code and this file will be compiled to
+ host machine code, so that all makes sense.
+
+ Only change the signatures of these helper functions very
+ carefully. If you change the signature here, you'll have to change
+ the parameters passed to it in the IR calls constructed by
+   guest_arm_toIR.c.
+*/
+
+
+
+/* generalised left-shifter */
+static inline UInt lshift ( UInt x, Int n )
+{
+ if (n >= 0)
+ return x << n;
+ else
+ return x >> (-n);
+}
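+/* E.g. lshift(x, 3) == x << 3, while lshift(x, -3) == x >> 3; a
+   negative shift amount selects a right shift. */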
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate NZCV from the supplied thunk components, in the positions
+ they appear in the CPSR, viz bits 31:28 for N Z C V respectively.
+ Returned bits 27:0 are zero. */
+UInt armg_calculate_flags_nzcv ( UInt cc_op, UInt cc_dep1,
+ UInt cc_dep2, UInt cc_dep3 )
+{
+ switch (cc_op) {
+ case ARMG_CC_OP_COPY:
+ /* (nzcv, unused, unused) */
+ return cc_dep1;
+ case ARMG_CC_OP_ADD: {
+ /* (argL, argR, unused) */
+ UInt argL = cc_dep1;
+ UInt argR = cc_dep2;
+ UInt res = argL + argR;
+ UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
+ UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
+ // CF and VF need verification
+ UInt cf = lshift( res < argL, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( (res ^ argL) & (res ^ argR),
+ ARMG_CC_SHIFT_V + 1 - 32 )
+ & ARMG_CC_MASK_V;
+ //vex_printf("%08x %08x -> n %x z %x c %x v %x\n",
+ // argL, argR, nf, zf, cf, vf);
+ return nf | zf | cf | vf;
+ }
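+      /* For the ADD case above: e.g. argL = argR = 0x80000000 gives
+         res == 0 and hence N=0 Z=1 C=1 V=1, i.e. a returned value
+         of 0x70000000. */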
+ case ARMG_CC_OP_SUB: {
+ /* (argL, argR, unused) */
+ UInt argL = cc_dep1;
+ UInt argR = cc_dep2;
+ UInt res = argL - argR;
+ UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
+ UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
+ // XXX cf is inverted relative to normal sense
+ UInt cf = lshift( argL >= argR, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( (argL ^ argR) & (argL ^ res),
+ ARMG_CC_SHIFT_V + 1 - 32 )
+ & ARMG_CC_MASK_V;
+ //vex_printf("%08x %08x -> n %x z %x c %x v %x\n",
+ // argL, argR, nf, zf, cf, vf);
+ return nf | zf | cf | vf;
+ }
+ case ARMG_CC_OP_ADC: {
+ /* (argL, argR, oldC) */
+ UInt argL = cc_dep1;
+ UInt argR = cc_dep2;
+ UInt oldC = cc_dep3;
+ UInt res = (argL + argR) + oldC;
+ UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
+ UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
+ UInt cf = oldC ? lshift( res <= argL, ARMG_CC_SHIFT_C )
+ : lshift( res < argL, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( (res ^ argL) & (res ^ argR),
+ ARMG_CC_SHIFT_V + 1 - 32 )
+ & ARMG_CC_MASK_V;
+ //vex_printf("%08x %08x -> n %x z %x c %x v %x\n",
+ // argL, argR, nf, zf, cf, vf);
+ return nf | zf | cf | vf;
+ }
+ case ARMG_CC_OP_SBB: {
+ /* (argL, argR, oldC) */
+ UInt argL = cc_dep1;
+ UInt argR = cc_dep2;
+ UInt oldC = cc_dep3;
+ UInt res = argL - argR - (oldC ^ 1);
+ UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
+ UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
+ UInt cf = oldC ? lshift( argL >= argR, ARMG_CC_SHIFT_C )
+ : lshift( argL > argR, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( (argL ^ argR) & (argL ^ res),
+ ARMG_CC_SHIFT_V + 1 - 32 )
+ & ARMG_CC_MASK_V;
+ //vex_printf("%08x %08x -> n %x z %x c %x v %x\n",
+ // argL, argR, nf, zf, cf, vf);
+ return nf | zf | cf | vf;
+ }
+ case ARMG_CC_OP_LOGIC: {
+ /* (res, shco, oldV) */
+ UInt res = cc_dep1;
+ UInt shco = cc_dep2;
+ UInt oldV = cc_dep3;
+ UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
+ UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
+ UInt cf = lshift( shco & 1, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+ return nf | zf | cf | vf;
+ }
+ case ARMG_CC_OP_MUL: {
+ /* (res, unused, oldC:oldV) */
+ UInt res = cc_dep1;
+ UInt oldC = (cc_dep3 >> 1) & 1;
+ UInt oldV = (cc_dep3 >> 0) & 1;
+ UInt nf = lshift( res & (1<<31), ARMG_CC_SHIFT_N - 31 );
+ UInt zf = lshift( res == 0, ARMG_CC_SHIFT_Z );
+ UInt cf = lshift( oldC & 1, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+ return nf | zf | cf | vf;
+ }
+ case ARMG_CC_OP_MULL: {
+ /* (resLo32, resHi32, oldC:oldV) */
+ UInt resLo32 = cc_dep1;
+ UInt resHi32 = cc_dep2;
+ UInt oldC = (cc_dep3 >> 1) & 1;
+ UInt oldV = (cc_dep3 >> 0) & 1;
+ UInt nf = lshift( resHi32 & (1<<31), ARMG_CC_SHIFT_N - 31 );
+ UInt zf = lshift( (resHi32|resLo32) == 0, ARMG_CC_SHIFT_Z );
+ UInt cf = lshift( oldC & 1, ARMG_CC_SHIFT_C );
+ UInt vf = lshift( oldV & 1, ARMG_CC_SHIFT_V );
+ return nf | zf | cf | vf;
+ }
+ default:
+ /* shouldn't really make these calls from generated code */
+ vex_printf("armg_calculate_flags_nzcv"
+ "( op=%u, dep1=0x%x, dep2=0x%x, dep3=0x%x )\n",
+ cc_op, cc_dep1, cc_dep2, cc_dep3 );
+ vpanic("armg_calculate_flags_nzcv");
+ }
+}
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate the C flag from the thunk components, in the lowest bit
+ of the word (bit 0). */
+UInt armg_calculate_flag_c ( UInt cc_op, UInt cc_dep1,
+ UInt cc_dep2, UInt cc_dep3 )
+{
+ UInt r = armg_calculate_flags_nzcv(cc_op, cc_dep1, cc_dep2, cc_dep3);
+ return (r >> ARMG_CC_SHIFT_C) & 1;
+}
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate the V flag from the thunk components, in the lowest bit
+ of the word (bit 0). */
+UInt armg_calculate_flag_v ( UInt cc_op, UInt cc_dep1,
+ UInt cc_dep2, UInt cc_dep3 )
+{
+ UInt r = armg_calculate_flags_nzcv(cc_op, cc_dep1, cc_dep2, cc_dep3);
+ return (r >> ARMG_CC_SHIFT_V) & 1;
+}
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate the QC flag from the arguments, in the lowest bit
+ of the word (bit 0). Urr, having this out of line is bizarre.
+ Push back inline. */
+UInt armg_calculate_flag_qc ( UInt resL1, UInt resL2,
+ UInt resR1, UInt resR2 )
+{
+ if (resL1 != resR1 || resL2 != resR2)
+ return 1;
+ else
+ return 0;
+}
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate the specified condition from the thunk components, in the
+ lowest bit of the word (bit 0). */
+extern
+UInt armg_calculate_condition ( UInt cond_n_op /* ARMCondcode << 4 | cc_op */,
+ UInt cc_dep1,
+ UInt cc_dep2, UInt cc_dep3 )
+{
+ UInt cond = cond_n_op >> 4;
+ UInt cc_op = cond_n_op & 0xF;
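+   /* Example: a GE test following a SUB arrives here with
+      cond_n_op == (ARMCondGE << 4) | ARMG_CC_OP_SUB == 0xA2,
+      giving cond == 10 and cc_op == 2. */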
+ UInt nf, zf, vf, cf, nzcv, inv;
+ // vex_printf("XXXXXXXX %x %x %x %x\n",
+ // cond_n_op, cc_dep1, cc_dep2, cc_dep3);
+
+ // skip flags computation in this case
+ if (cond == ARMCondAL) return 1;
+
+ inv = cond & 1;
+ nzcv = armg_calculate_flags_nzcv(cc_op, cc_dep1, cc_dep2, cc_dep3);
+
+ switch (cond) {
+ case ARMCondEQ: // Z=1 => z
+ case ARMCondNE: // Z=0
+ zf = nzcv >> ARMG_CC_SHIFT_Z;
+ return 1 & (inv ^ zf);
+
+ case ARMCondHS: // C=1 => c
+ case ARMCondLO: // C=0
+ cf = nzcv >> ARMG_CC_SHIFT_C;
+ return 1 & (inv ^ cf);
+
+ case ARMCondMI: // N=1 => n
+ case ARMCondPL: // N=0
+ nf = nzcv >> ARMG_CC_SHIFT_N;
+ return 1 & (inv ^ nf);
+
+ case ARMCondVS: // V=1 => v
+ case ARMCondVC: // V=0
+ vf = nzcv >> ARMG_CC_SHIFT_V;
+ return 1 & (inv ^ vf);
+
+ case ARMCondHI: // C=1 && Z=0 => c & ~z
+ case ARMCondLS: // C=0 || Z=1
+ cf = nzcv >> ARMG_CC_SHIFT_C;
+ zf = nzcv >> ARMG_CC_SHIFT_Z;
+ return 1 & (inv ^ (cf & ~zf));
+
+ case ARMCondGE: // N=V => ~(n^v)
+ case ARMCondLT: // N!=V
+ nf = nzcv >> ARMG_CC_SHIFT_N;
+ vf = nzcv >> ARMG_CC_SHIFT_V;
+ return 1 & (inv ^ ~(nf ^ vf));
+
+ case ARMCondGT: // Z=0 && N=V => ~z & ~(n^v) => ~(z | (n^v))
+ case ARMCondLE: // Z=1 || N!=V
+ nf = nzcv >> ARMG_CC_SHIFT_N;
+ vf = nzcv >> ARMG_CC_SHIFT_V;
+ zf = nzcv >> ARMG_CC_SHIFT_Z;
+ return 1 & (inv ^ ~(zf | (nf ^ vf)));
+
+ case ARMCondAL: // handled above
+ case ARMCondNV: // should never get here: Illegal instr
+ default:
+ /* shouldn't really make these calls from generated code */
+ vex_printf("armg_calculate_condition(ARM)"
+ "( %u, %u, 0x%x, 0x%x, 0x%x )\n",
+ cond, cc_op, cc_dep1, cc_dep2, cc_dep3 );
+ vpanic("armg_calculate_condition(ARM)");
+ }
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Flag-helpers translation-time function specialisers. ---*/
+/*--- These help iropt specialise calls to the above run-time ---*/
+/*--- flags functions. ---*/
+/*---------------------------------------------------------------*/
+
+/* Used by the optimiser to try specialisations. Returns an
+ equivalent expression, or NULL if none. */
+
+static Bool isU32 ( IRExpr* e, UInt n )
+{
+ return
+ toBool( e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U32
+ && e->Iex.Const.con->Ico.U32 == n );
+}
+
+IRExpr* guest_arm_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts )
+{
+# define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
+# define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
+# define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
+# define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
+
+ Int i, arity = 0;
+ for (i = 0; args[i]; i++)
+ arity++;
+# if 0
+ vex_printf("spec request:\n");
+ vex_printf(" %s ", function_name);
+ for (i = 0; i < arity; i++) {
+ vex_printf(" ");
+ ppIRExpr(args[i]);
+ }
+ vex_printf("\n");
+# endif
+
+ /* --------- specialising "armg_calculate_condition" --------- */
+
+ if (vex_streq(function_name, "armg_calculate_condition")) {
+      /* specialise calls to the above "armg_calculate_condition" function */
+ IRExpr *cond_n_op, *cc_dep1, *cc_dep2, *cc_dep3;
+ vassert(arity == 4);
+ cond_n_op = args[0]; /* ARMCondcode << 4 | ARMG_CC_OP_* */
+ cc_dep1 = args[1];
+ cc_dep2 = args[2];
+ cc_dep3 = args[3];
+
+ /*---------------- SUB ----------------*/
+
+ if (isU32(cond_n_op, (ARMCondEQ << 4) | ARMG_CC_OP_SUB)) {
+ /* EQ after SUB --> test argL == argR */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32, cc_dep1, cc_dep2));
+ }
+ if (isU32(cond_n_op, (ARMCondNE << 4) | ARMG_CC_OP_SUB)) {
+ /* NE after SUB --> test argL != argR */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpNE32, cc_dep1, cc_dep2));
+ }
+
+ if (isU32(cond_n_op, (ARMCondLE << 4) | ARMG_CC_OP_SUB)) {
+ /* LE after SUB --> test argL <=s argR */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLE32S, cc_dep1, cc_dep2));
+ }
+
+ if (isU32(cond_n_op, (ARMCondLT << 4) | ARMG_CC_OP_SUB)) {
+ /* LT after SUB --> test argL <s argR */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLT32S, cc_dep1, cc_dep2));
+ }
+
+ if (isU32(cond_n_op, (ARMCondGE << 4) | ARMG_CC_OP_SUB)) {
+ /* GE after SUB --> test argL >=s argR
+ --> test argR <=s argL */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLE32S, cc_dep2, cc_dep1));
+ }
+
+ if (isU32(cond_n_op, (ARMCondHS << 4) | ARMG_CC_OP_SUB)) {
+ /* HS after SUB --> test argL >=u argR
+ --> test argR <=u argL */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLE32U, cc_dep2, cc_dep1));
+ }
+
+ if (isU32(cond_n_op, (ARMCondLS << 4) | ARMG_CC_OP_SUB)) {
+ /* LS after SUB --> test argL <=u argR */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLE32U, cc_dep1, cc_dep2));
+ }
+
+ /*---------------- LOGIC ----------------*/
+ if (isU32(cond_n_op, (ARMCondEQ << 4) | ARMG_CC_OP_LOGIC)) {
+ /* EQ after LOGIC --> test res == 0 */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
+ }
+ if (isU32(cond_n_op, (ARMCondNE << 4) | ARMG_CC_OP_LOGIC)) {
+ /* NE after LOGIC --> test res != 0 */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpNE32, cc_dep1, mkU32(0)));
+ }
+
+ /*----------------- AL -----------------*/
+ /* A critically important case for Thumb code.
+
+ What we're trying to spot is the case where cond_n_op is an
+ expression of the form Or32(..., 0xE0) since that means the
+ caller is asking for CondAL and we can simply return 1
+ without caring what the ... part is. This is a potentially
+ dodgy kludge in that it assumes that the ... part has zeroes
+ in bits 7:4, so that the result of the Or32 is guaranteed to
+         be 0xE in bits 7:4. Given that the places where this first
+         arg is constructed (in guest_arm_toIR.c) are very
+         constrained, we can get away with this. To make this
+         guaranteed safe would require a new primop, Slice44
+         or some such, thusly
+
+ Slice44(arg1, arg2) = 0--(24)--0 arg1[7:4] arg2[3:0]
+
+ and we would then look for Slice44(0xE0, ...)
+ which would give the required safety property.
+
+ It would be infeasibly expensive to scan backwards through
+ the entire block looking for an assignment to the temp, so
+ just look at the previous 16 statements. That should find it
+ if it is an interesting case, as a result of how the
+ boilerplate guff at the start of each Thumb insn translation
+ is made.
+ */
+ if (cond_n_op->tag == Iex_RdTmp) {
+ Int j;
+ IRTemp look_for = cond_n_op->Iex.RdTmp.tmp;
+ Int limit = n_precedingStmts - 16;
+ if (limit < 0) limit = 0;
+ if (0) vex_printf("scanning %d .. %d\n", n_precedingStmts-1, limit);
+ for (j = n_precedingStmts - 1; j >= limit; j--) {
+ IRStmt* st = precedingStmts[j];
+ if (st->tag == Ist_WrTmp
+ && st->Ist.WrTmp.tmp == look_for
+ && st->Ist.WrTmp.data->tag == Iex_Binop
+ && st->Ist.WrTmp.data->Iex.Binop.op == Iop_Or32
+ && isU32(st->Ist.WrTmp.data->Iex.Binop.arg2, (ARMCondAL << 4)))
+ return mkU32(1);
+ }
+ /* Didn't find any useful binding to the first arg
+ in the previous 16 stmts. */
+ }
+ }
+
+# undef unop
+# undef binop
+# undef mkU32
+# undef mkU8
+
+ return NULL;
+}
+
+
+/*----------------------------------------------*/
+/*--- The exported fns .. ---*/
+/*----------------------------------------------*/
+
+/* VISIBLE TO LIBVEX CLIENT */
+#if 0
+void LibVEX_GuestARM_put_flags ( UInt flags_native,
+ /*OUT*/VexGuestARMState* vex_state )
+{
+ vassert(0); // FIXME
+
+ /* Mask out everything except N Z V C. */
+ flags_native
+ &= (ARMG_CC_MASK_N | ARMG_CC_MASK_Z | ARMG_CC_MASK_V | ARMG_CC_MASK_C);
+
+ vex_state->guest_CC_OP = ARMG_CC_OP_COPY;
+ vex_state->guest_CC_DEP1 = flags_native;
+ vex_state->guest_CC_DEP2 = 0;
+ vex_state->guest_CC_NDEP = 0;
+}
+#endif
+
+/* VISIBLE TO LIBVEX CLIENT */
+UInt LibVEX_GuestARM_get_cpsr ( /*IN*/VexGuestARMState* vex_state )
+{
+ UInt cpsr = 0;
+ // NZCV
+ cpsr |= armg_calculate_flags_nzcv(
+ vex_state->guest_CC_OP,
+ vex_state->guest_CC_DEP1,
+ vex_state->guest_CC_DEP2,
+ vex_state->guest_CC_NDEP
+ );
+ vassert(0 == (cpsr & 0x0FFFFFFF));
+ // Q
+ if (vex_state->guest_QFLAG32 > 0)
+ cpsr |= (1 << 27);
+ // GE
+ if (vex_state->guest_GEFLAG0 > 0)
+ cpsr |= (1 << 16);
+ if (vex_state->guest_GEFLAG1 > 0)
+ cpsr |= (1 << 17);
+ if (vex_state->guest_GEFLAG2 > 0)
+ cpsr |= (1 << 18);
+ if (vex_state->guest_GEFLAG3 > 0)
+ cpsr |= (1 << 19);
+ // M
+ cpsr |= (1 << 4); // 0b10000 means user-mode
+ // J,T J (bit 24) is zero by initialisation above
+ // T we copy from R15T[0]
+ if (vex_state->guest_R15T & 1)
+ cpsr |= (1 << 5);
+ // ITSTATE we punt on for the time being. Could compute it
+ // if needed though.
+ // E, endianness, 0 (littleendian) from initialisation above
+ // A,I,F disable some async exceptions. Not sure about these.
+ // Leave as zero for the time being.
+ return cpsr;
+}
+
+/* VISIBLE TO LIBVEX CLIENT */
+void LibVEX_GuestARM_initialise ( /*OUT*/VexGuestARMState* vex_state )
+{
+ vex_state->guest_R0 = 0;
+ vex_state->guest_R1 = 0;
+ vex_state->guest_R2 = 0;
+ vex_state->guest_R3 = 0;
+ vex_state->guest_R4 = 0;
+ vex_state->guest_R5 = 0;
+ vex_state->guest_R6 = 0;
+ vex_state->guest_R7 = 0;
+ vex_state->guest_R8 = 0;
+ vex_state->guest_R9 = 0;
+ vex_state->guest_R10 = 0;
+ vex_state->guest_R11 = 0;
+ vex_state->guest_R12 = 0;
+ vex_state->guest_R13 = 0;
+ vex_state->guest_R14 = 0;
+ vex_state->guest_R15T = 0; /* NB: implies ARM mode */
+
+ vex_state->guest_CC_OP = ARMG_CC_OP_COPY;
+ vex_state->guest_CC_DEP1 = 0;
+ vex_state->guest_CC_DEP2 = 0;
+ vex_state->guest_CC_NDEP = 0;
+ vex_state->guest_QFLAG32 = 0;
+ vex_state->guest_GEFLAG0 = 0;
+ vex_state->guest_GEFLAG1 = 0;
+ vex_state->guest_GEFLAG2 = 0;
+ vex_state->guest_GEFLAG3 = 0;
+
+ vex_state->guest_EMWARN = 0;
+ vex_state->guest_TISTART = 0;
+ vex_state->guest_TILEN = 0;
+ vex_state->guest_NRADDR = 0;
+ vex_state->guest_IP_AT_SYSCALL = 0;
+
+ vex_state->guest_D0 = 0;
+ vex_state->guest_D1 = 0;
+ vex_state->guest_D2 = 0;
+ vex_state->guest_D3 = 0;
+ vex_state->guest_D4 = 0;
+ vex_state->guest_D5 = 0;
+ vex_state->guest_D6 = 0;
+ vex_state->guest_D7 = 0;
+ vex_state->guest_D8 = 0;
+ vex_state->guest_D9 = 0;
+ vex_state->guest_D10 = 0;
+ vex_state->guest_D11 = 0;
+ vex_state->guest_D12 = 0;
+ vex_state->guest_D13 = 0;
+ vex_state->guest_D14 = 0;
+ vex_state->guest_D15 = 0;
+ vex_state->guest_D16 = 0;
+ vex_state->guest_D17 = 0;
+ vex_state->guest_D18 = 0;
+ vex_state->guest_D19 = 0;
+ vex_state->guest_D20 = 0;
+ vex_state->guest_D21 = 0;
+ vex_state->guest_D22 = 0;
+ vex_state->guest_D23 = 0;
+ vex_state->guest_D24 = 0;
+ vex_state->guest_D25 = 0;
+ vex_state->guest_D26 = 0;
+ vex_state->guest_D27 = 0;
+ vex_state->guest_D28 = 0;
+ vex_state->guest_D29 = 0;
+ vex_state->guest_D30 = 0;
+ vex_state->guest_D31 = 0;
+
+ /* ARM encoded; zero is the default as it happens (result flags
+ (NZCV) cleared, FZ disabled, round to nearest, non-vector mode,
+ all exns masked, all exn sticky bits cleared). */
+ vex_state->guest_FPSCR = 0;
+
+ vex_state->guest_TPIDRURO = 0;
+
+ /* Not in a Thumb IT block. */
+ vex_state->guest_ITSTATE = 0;
+
+ vex_state->padding1 = 0;
+ vex_state->padding2 = 0;
+ vex_state->padding3 = 0;
+}
+
+
+/*-----------------------------------------------------------*/
+/*--- Describing the arm guest state, for the benefit ---*/
+/*--- of iropt and instrumenters. ---*/
+/*-----------------------------------------------------------*/
+
+/* Figure out if any part of the guest state contained in minoff
+ .. maxoff requires precise memory exceptions. If in doubt return
+   True (but this generates significantly slower code).
+
+ We enforce precise exns for guest R13(sp), R15T(pc).
+*/
+Bool guest_arm_state_requires_precise_mem_exns ( Int minoff,
+ Int maxoff)
+{
+ Int sp_min = offsetof(VexGuestARMState, guest_R13);
+ Int sp_max = sp_min + 4 - 1;
+ Int pc_min = offsetof(VexGuestARMState, guest_R15T);
+ Int pc_max = pc_min + 4 - 1;
+
+ if (maxoff < sp_min || minoff > sp_max) {
+ /* no overlap with sp */
+ } else {
+ return True;
+ }
+
+ if (maxoff < pc_min || minoff > pc_max) {
+ /* no overlap with pc */
+ } else {
+ return True;
+ }
+
+ /* We appear to need precise updates of R11 in order to get proper
+ stacktraces from non-optimised code. */
+ Int r11_min = offsetof(VexGuestARMState, guest_R11);
+ Int r11_max = r11_min + 4 - 1;
+
+ if (maxoff < r11_min || minoff > r11_max) {
+ /* no overlap with r11 */
+ } else {
+ return True;
+ }
+
+ /* Ditto R7, particularly needed for proper stacktraces in Thumb
+ code. */
+ Int r7_min = offsetof(VexGuestARMState, guest_R7);
+ Int r7_max = r7_min + 4 - 1;
+
+ if (maxoff < r7_min || minoff > r7_max) {
+ /* no overlap with r7 */
+ } else {
+ return True;
+ }
+
+ return False;
+}
+
+
+
+#define ALWAYSDEFD(field) \
+ { offsetof(VexGuestARMState, field), \
+ (sizeof ((VexGuestARMState*)0)->field) }
+
+VexGuestLayout
+ armGuest_layout
+ = {
+ /* Total size of the guest state, in bytes. */
+ .total_sizeB = sizeof(VexGuestARMState),
+
+ /* Describe the stack pointer. */
+ .offset_SP = offsetof(VexGuestARMState,guest_R13),
+ .sizeof_SP = 4,
+
+ /* Describe the instruction pointer. */
+ .offset_IP = offsetof(VexGuestARMState,guest_R15T),
+ .sizeof_IP = 4,
+
+ /* Describe any sections to be regarded by Memcheck as
+ 'always-defined'. */
+ .n_alwaysDefd = 10,
+
+ /* flags thunk: OP is always defd, whereas DEP1 and DEP2
+ have to be tracked. See detailed comment in gdefs.h on
+ meaning of thunk fields. */
+ .alwaysDefd
+ = { /* 0 */ ALWAYSDEFD(guest_R15T),
+ /* 1 */ ALWAYSDEFD(guest_CC_OP),
+ /* 2 */ ALWAYSDEFD(guest_CC_NDEP),
+ /* 3 */ ALWAYSDEFD(guest_EMWARN),
+ /* 4 */ ALWAYSDEFD(guest_TISTART),
+ /* 5 */ ALWAYSDEFD(guest_TILEN),
+ /* 6 */ ALWAYSDEFD(guest_NRADDR),
+ /* 7 */ ALWAYSDEFD(guest_IP_AT_SYSCALL),
+ /* 8 */ ALWAYSDEFD(guest_TPIDRURO),
+ /* 9 */ ALWAYSDEFD(guest_ITSTATE)
+ }
+ };
+
+
+/*---------------------------------------------------------------*/
+/*--- end guest_arm_helpers.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_arm_toIR.c b/VEX/priv/guest_arm_toIR.c
new file mode 100644
index 0000000..c1f9211
--- /dev/null
+++ b/VEX/priv/guest_arm_toIR.c
@@ -0,0 +1,17970 @@
+
+/*--------------------------------------------------------------------*/
+/*--- begin guest_arm_toIR.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ NEON support is
+ Copyright (C) 2010-2010 Samsung Electronics
+ contributed by Dmitry Zhurikhin <zhur@ispras.ru>
+ and Kirill Batuzov <batuzovk@ispras.ru>
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* XXXX thumb to check:
+ that all cases where putIRegT writes r15, we generate a jump.
+
+ All uses of newTemp assign to an IRTemp and not a UInt
+
+ For all thumb loads and stores, including VFP ones, new-ITSTATE is
+ backed out before the memory op, and restored afterwards. This
+ needs to happen even after we go uncond. (and for sure it doesn't
+ happen for VFP loads/stores right now).
+
+ VFP on thumb: check that we exclude all r13/r15 cases that we
+ should.
+
+ XXXX thumb to do: improve the ITSTATE-zeroing optimisation by
+ taking into account the number of insns guarded by an IT.
+
+ remove the nasty hack, in the spechelper, of looking for Or32(...,
+   0xE0) as the first arg to armg_calculate_condition, and instead
+ use Slice44 as specified in comments in the spechelper.
+
+ add specialisations for armg_calculate_flag_c and _v, as they
+ are moderately often needed in Thumb code.
+
+ Correctness: ITSTATE handling in Thumb SVCs is wrong.
+
+ Correctness (obscure): in m_transtab, when invalidating code
+ address ranges, invalidate up to 18 bytes after the end of the
+ range. This is because the ITSTATE optimisation at the top of
+ _THUMB_WRK below analyses up to 18 bytes before the start of any
+ given instruction, and so might depend on the invalidated area.
+*/
+
+/* Limitations, etc
+
+ - pretty dodgy exception semantics for {LD,ST}Mxx, no doubt
+
+ - SWP: the restart jump back is Ijk_Boring; it should be
+ Ijk_NoRedir but that's expensive. See comments on casLE() in
+ guest_x86_toIR.c.
+*/
+
+/* "Special" instructions.
+
+ This instruction decoder can decode four special instructions
+ which mean nothing natively (are no-ops as far as regs/mem are
+ concerned) but have meaning for supporting Valgrind. A special
+ instruction is flagged by a 16-byte preamble:
+
+ E1A0C1EC E1A0C6EC E1A0CEEC E1A0C9EC
+ (mov r12, r12, ROR #3; mov r12, r12, ROR #13;
+ mov r12, r12, ROR #29; mov r12, r12, ROR #19)
+
+   Following that, one of the following 3 is allowed
+ (standard interpretation in parentheses):
+
+ E18AA00A (orr r10,r10,r10) R3 = client_request ( R4 )
+ E18BB00B (orr r11,r11,r11) R3 = guest_NRADDR
+ E18CC00C (orr r12,r12,r12) branch-and-link-to-noredir R4
+
+ Any other bytes following the 16-byte preamble are illegal and
+ constitute a failure in instruction decoding. This all assumes
+ that the preamble will never occur except in specific code
+ fragments designed for Valgrind to catch.
+*/
+
+/* Translates ARM(v5) code to IR. */
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+#include "libvex_guest_arm.h"
+
+#include "main_util.h"
+#include "main_globals.h"
+#include "guest_generic_bb_to_IR.h"
+#include "guest_arm_defs.h"
+
+
+/*------------------------------------------------------------*/
+/*--- Globals ---*/
+/*------------------------------------------------------------*/
+
+/* These are set at the start of the translation of an instruction, so
+ that we don't have to pass them around endlessly. CONST means does
+ not change during translation of the instruction.
+*/
+
+/* CONST: is the host bigendian? This has to do with float vs double
+ register accesses on VFP, but it's complex and not properly thought
+ out. */
+static Bool host_is_bigendian;
+
+/* CONST: The guest address for the instruction currently being
+ translated. This is the real, "decoded" address (not subject
+ to the CPSR.T kludge). */
+static Addr32 guest_R15_curr_instr_notENC;
+
+/* CONST, FOR ASSERTIONS ONLY. Indicates whether currently processed
+ insn is Thumb (True) or ARM (False). */
+static Bool __curr_is_Thumb;
+
+/* MOD: The IRSB* into which we're generating code. */
+static IRSB* irsb;
+
+/* These are to do with handling writes to r15. They are initially
+ set at the start of disInstr_ARM_WRK to indicate no update,
+ possibly updated during the routine, and examined again at the end.
+ If they have been set to indicate a r15 update then a jump is
+ generated. Note, "explicit" jumps (b, bx, etc) are generated
+ directly, not using this mechanism -- this is intended to handle
+ the implicit-style jumps resulting from (eg) assigning to r15 as
+ the result of insns we wouldn't normally consider branchy. */
+
+/* MOD. Initially False; set to True iff abovementioned handling is
+ required. */
+static Bool r15written;
+
+/* MOD. Initially IRTemp_INVALID. If the r15 branch to be generated
+ is conditional, this holds the gating IRTemp :: Ity_I32. If the
+ branch to be generated is unconditional, this remains
+ IRTemp_INVALID. */
+static IRTemp r15guard; /* :: Ity_I32, 0 or 1 */
+
+/* MOD. Initially Ijk_Boring. If an r15 branch is to be generated,
+ this holds the jump kind. */
+static IRJumpKind r15kind;
+
+
+/*------------------------------------------------------------*/
+/*--- Debugging output ---*/
+/*------------------------------------------------------------*/
+
+#define DIP(format, args...) \
+ if (vex_traceflags & VEX_TRACE_FE) \
+ vex_printf(format, ## args)
+
+#define DIS(buf, format, args...) \
+ if (vex_traceflags & VEX_TRACE_FE) \
+ vex_sprintf(buf, format, ## args)
+
+#define ASSERT_IS_THUMB \
+ do { vassert(__curr_is_Thumb); } while (0)
+
+#define ASSERT_IS_ARM \
+ do { vassert(! __curr_is_Thumb); } while (0)
+
+
+/*------------------------------------------------------------*/
+/*--- Helper bits and pieces for deconstructing the ---*/
+/*--- arm insn stream. ---*/
+/*------------------------------------------------------------*/
+
+/* Do a little-endian load of a 32-bit word, regardless of the
+ endianness of the underlying host. */
+static inline UInt getUIntLittleEndianly ( UChar* p )
+{
+ UInt w = 0;
+ w = (w << 8) | p[3];
+ w = (w << 8) | p[2];
+ w = (w << 8) | p[1];
+ w = (w << 8) | p[0];
+ return w;
+}
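+/* E.g. bytes p[0..3] = { 0x78, 0x56, 0x34, 0x12 } yield 0x12345678,
+   whatever the endianness of the host. */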
+
+/* Do a little-endian load of a 16-bit word, regardless of the
+ endianness of the underlying host. */
+static inline UShort getUShortLittleEndianly ( UChar* p )
+{
+ UShort w = 0;
+ w = (w << 8) | p[1];
+ w = (w << 8) | p[0];
+ return w;
+}
+
+static UInt ROR32 ( UInt x, UInt sh ) {
+ vassert(sh >= 0 && sh < 32);
+ if (sh == 0)
+ return x;
+ else
+ return (x << (32-sh)) | (x >> sh);
+}
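+/* E.g. ROR32(0x80000001, 1) == 0xC0000000, and ROR32(x, 0) == x. */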
+
+static Int popcount32 ( UInt x )
+{
+ Int res = 0, i;
+ for (i = 0; i < 32; i++) {
+ res += (x & 1);
+ x >>= 1;
+ }
+ return res;
+}
+
+static UInt setbit32 ( UInt x, Int ix, UInt b )
+{
+ UInt mask = 1 << ix;
+ x &= ~mask;
+ x |= ((b << ix) & mask);
+ return x;
+}
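+/* E.g. setbit32(0, 5, 1) == 0x20 and setbit32(0xFF, 0, 0) == 0xFE. */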
+
+#define BITS2(_b1,_b0) \
+ (((_b1) << 1) | (_b0))
+
+#define BITS3(_b2,_b1,_b0) \
+ (((_b2) << 2) | ((_b1) << 1) | (_b0))
+
+#define BITS4(_b3,_b2,_b1,_b0) \
+ (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))
+
+#define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
+ ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4) \
+ | BITS4((_b3),(_b2),(_b1),(_b0)))
+
+#define BITS5(_b4,_b3,_b2,_b1,_b0) \
+ (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
+#define BITS6(_b5,_b4,_b3,_b2,_b1,_b0) \
+ (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
+#define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
+ (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
+
+#define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
+ (((_b8) << 8) \
+ | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
+
+#define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
+ (((_b9) << 9) | ((_b8) << 8) \
+ | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
+
+/* produces _uint[_bMax:_bMin] */
+#define SLICE_UInt(_uint,_bMax,_bMin) \
+ (( ((UInt)(_uint)) >> (_bMin)) \
+ & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))
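+
+/* E.g. BITS4(1,0,0,1) == 9, BITS8(1,1,1,0,0,0,0,0) == 0xE0, and
+   SLICE_UInt(0xABCD, 15, 8) == 0xAB. */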
+
+
+/*------------------------------------------------------------*/
+/*--- Helper bits and pieces for creating IR fragments. ---*/
+/*------------------------------------------------------------*/
+
+static IRExpr* mkU64 ( ULong i )
+{
+ return IRExpr_Const(IRConst_U64(i));
+}
+
+static IRExpr* mkU32 ( UInt i )
+{
+ return IRExpr_Const(IRConst_U32(i));
+}
+
+static IRExpr* mkU8 ( UInt i )
+{
+ vassert(i < 256);
+ return IRExpr_Const(IRConst_U8( (UChar)i ));
+}
+
+static IRExpr* mkexpr ( IRTemp tmp )
+{
+ return IRExpr_RdTmp(tmp);
+}
+
+static IRExpr* unop ( IROp op, IRExpr* a )
+{
+ return IRExpr_Unop(op, a);
+}
+
+static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
+{
+ return IRExpr_Binop(op, a1, a2);
+}
+
+static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
+{
+ return IRExpr_Triop(op, a1, a2, a3);
+}
+
+static IRExpr* loadLE ( IRType ty, IRExpr* addr )
+{
+ return IRExpr_Load(Iend_LE, ty, addr);
+}
+
+/* Add a statement to the list held by "irbb". */
+static void stmt ( IRStmt* st )
+{
+ addStmtToIRSB( irsb, st );
+}
+
+static void assign ( IRTemp dst, IRExpr* e )
+{
+ stmt( IRStmt_WrTmp(dst, e) );
+}
+
+static void storeLE ( IRExpr* addr, IRExpr* data )
+{
+ stmt( IRStmt_Store(Iend_LE, addr, data) );
+}
+
+/* Generate a new temporary of the given type. */
+static IRTemp newTemp ( IRType ty )
+{
+ vassert(isPlausibleIRType(ty));
+ return newIRTemp( irsb->tyenv, ty );
+}
+
+/* Produces a value in 0 .. 3, which is encoded as per the type
+ IRRoundingMode. */
+static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
+{
+ return mkU32(Irrm_NEAREST);
+}
+
+/* Generate an expression for SRC rotated right by ROT. */
+static IRExpr* genROR32( IRTemp src, Int rot )
+{
+ vassert(rot >= 0 && rot < 32);
+ if (rot == 0)
+ return mkexpr(src);
+ return
+ binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
+ binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
+}
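+/* E.g. genROR32(src, 8) emits Or32(Shl32(src, 24), Shr32(src, 8)),
+   the IR counterpart of ROR32 above. */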
+
+static IRExpr* mkU128 ( ULong i )
+{
+ return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
+}
+
+/* Generate a 4-aligned version of the given expression if
+ the given condition is true. Else return it unchanged. */
+static IRExpr* align4if ( IRExpr* e, Bool b )
+{
+ if (b)
+ return binop(Iop_And32, e, mkU32(~3));
+ else
+ return e;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for accessing guest registers. ---*/
+/*------------------------------------------------------------*/
+
+#define OFFB_R0 offsetof(VexGuestARMState,guest_R0)
+#define OFFB_R1 offsetof(VexGuestARMState,guest_R1)
+#define OFFB_R2 offsetof(VexGuestARMState,guest_R2)
+#define OFFB_R3 offsetof(VexGuestARMState,guest_R3)
+#define OFFB_R4 offsetof(VexGuestARMState,guest_R4)
+#define OFFB_R5 offsetof(VexGuestARMState,guest_R5)
+#define OFFB_R6 offsetof(VexGuestARMState,guest_R6)
+#define OFFB_R7 offsetof(VexGuestARMState,guest_R7)
+#define OFFB_R8 offsetof(VexGuestARMState,guest_R8)
+#define OFFB_R9 offsetof(VexGuestARMState,guest_R9)
+#define OFFB_R10 offsetof(VexGuestARMState,guest_R10)
+#define OFFB_R11 offsetof(VexGuestARMState,guest_R11)
+#define OFFB_R12 offsetof(VexGuestARMState,guest_R12)
+#define OFFB_R13 offsetof(VexGuestARMState,guest_R13)
+#define OFFB_R14 offsetof(VexGuestARMState,guest_R14)
+#define OFFB_R15T offsetof(VexGuestARMState,guest_R15T)
+
+#define OFFB_CC_OP offsetof(VexGuestARMState,guest_CC_OP)
+#define OFFB_CC_DEP1 offsetof(VexGuestARMState,guest_CC_DEP1)
+#define OFFB_CC_DEP2 offsetof(VexGuestARMState,guest_CC_DEP2)
+#define OFFB_CC_NDEP offsetof(VexGuestARMState,guest_CC_NDEP)
+#define OFFB_NRADDR offsetof(VexGuestARMState,guest_NRADDR)
+
+#define OFFB_D0 offsetof(VexGuestARMState,guest_D0)
+#define OFFB_D1 offsetof(VexGuestARMState,guest_D1)
+#define OFFB_D2 offsetof(VexGuestARMState,guest_D2)
+#define OFFB_D3 offsetof(VexGuestARMState,guest_D3)
+#define OFFB_D4 offsetof(VexGuestARMState,guest_D4)
+#define OFFB_D5 offsetof(VexGuestARMState,guest_D5)
+#define OFFB_D6 offsetof(VexGuestARMState,guest_D6)
+#define OFFB_D7 offsetof(VexGuestARMState,guest_D7)
+#define OFFB_D8 offsetof(VexGuestARMState,guest_D8)
+#define OFFB_D9 offsetof(VexGuestARMState,guest_D9)
+#define OFFB_D10 offsetof(VexGuestARMState,guest_D10)
+#define OFFB_D11 offsetof(VexGuestARMState,guest_D11)
+#define OFFB_D12 offsetof(VexGuestARMState,guest_D12)
+#define OFFB_D13 offsetof(VexGuestARMState,guest_D13)
+#define OFFB_D14 offsetof(VexGuestARMState,guest_D14)
+#define OFFB_D15 offsetof(VexGuestARMState,guest_D15)
+#define OFFB_D16 offsetof(VexGuestARMState,guest_D16)
+#define OFFB_D17 offsetof(VexGuestARMState,guest_D17)
+#define OFFB_D18 offsetof(VexGuestARMState,guest_D18)
+#define OFFB_D19 offsetof(VexGuestARMState,guest_D19)
+#define OFFB_D20 offsetof(VexGuestARMState,guest_D20)
+#define OFFB_D21 offsetof(VexGuestARMState,guest_D21)
+#define OFFB_D22 offsetof(VexGuestARMState,guest_D22)
+#define OFFB_D23 offsetof(VexGuestARMState,guest_D23)
+#define OFFB_D24 offsetof(VexGuestARMState,guest_D24)
+#define OFFB_D25 offsetof(VexGuestARMState,guest_D25)
+#define OFFB_D26 offsetof(VexGuestARMState,guest_D26)
+#define OFFB_D27 offsetof(VexGuestARMState,guest_D27)
+#define OFFB_D28 offsetof(VexGuestARMState,guest_D28)
+#define OFFB_D29 offsetof(VexGuestARMState,guest_D29)
+#define OFFB_D30 offsetof(VexGuestARMState,guest_D30)
+#define OFFB_D31 offsetof(VexGuestARMState,guest_D31)
+
+#define OFFB_FPSCR offsetof(VexGuestARMState,guest_FPSCR)
+#define OFFB_TPIDRURO offsetof(VexGuestARMState,guest_TPIDRURO)
+#define OFFB_ITSTATE offsetof(VexGuestARMState,guest_ITSTATE)
+#define OFFB_QFLAG32 offsetof(VexGuestARMState,guest_QFLAG32)
+#define OFFB_GEFLAG0 offsetof(VexGuestARMState,guest_GEFLAG0)
+#define OFFB_GEFLAG1 offsetof(VexGuestARMState,guest_GEFLAG1)
+#define OFFB_GEFLAG2 offsetof(VexGuestARMState,guest_GEFLAG2)
+#define OFFB_GEFLAG3 offsetof(VexGuestARMState,guest_GEFLAG3)
+
+
+/* ---------------- Integer registers ---------------- */
+
+static Int integerGuestRegOffset ( UInt iregNo )
+{
+ /* Do we care about endianness here? We do if sub-parts of integer
+ registers are accessed, but I don't think that ever happens on
+ ARM. */
+ switch (iregNo) {
+ case 0: return OFFB_R0;
+ case 1: return OFFB_R1;
+ case 2: return OFFB_R2;
+ case 3: return OFFB_R3;
+ case 4: return OFFB_R4;
+ case 5: return OFFB_R5;
+ case 6: return OFFB_R6;
+ case 7: return OFFB_R7;
+ case 8: return OFFB_R8;
+ case 9: return OFFB_R9;
+ case 10: return OFFB_R10;
+ case 11: return OFFB_R11;
+ case 12: return OFFB_R12;
+ case 13: return OFFB_R13;
+ case 14: return OFFB_R14;
+ case 15: return OFFB_R15T;
+ default: vassert(0);
+ }
+}
+
+/* Plain ("low level") read from a reg; no +8 offset magic for r15. */
+static IRExpr* llGetIReg ( UInt iregNo )
+{
+ vassert(iregNo < 16);
+ return IRExpr_Get( integerGuestRegOffset(iregNo), Ity_I32 );
+}
+
+/* Architected read from a reg in ARM mode. This automagically adds 8
+ to all reads of r15. */
+static IRExpr* getIRegA ( UInt iregNo )
+{
+ IRExpr* e;
+ ASSERT_IS_ARM;
+ vassert(iregNo < 16);
+ if (iregNo == 15) {
+ /* If asked for r15, don't read the guest state value, as that
+ may not be up to date in the case where loop unrolling has
+ happened, because the first insn's write to the block is
+ omitted; hence in the 2nd and subsequent unrollings we don't
+ have a correct value in guest r15. Instead produce the
+ constant that we know would be produced at this point. */
+ vassert(0 == (guest_R15_curr_instr_notENC & 3));
+ e = mkU32(guest_R15_curr_instr_notENC + 8);
+ } else {
+ e = IRExpr_Get( integerGuestRegOffset(iregNo), Ity_I32 );
+ }
+ return e;
+}
+
+/* Architected read from a reg in Thumb mode. This automagically adds
+ 4 to all reads of r15. */
+static IRExpr* getIRegT ( UInt iregNo )
+{
+ IRExpr* e;
+ ASSERT_IS_THUMB;
+ vassert(iregNo < 16);
+ if (iregNo == 15) {
+ /* Ditto comment in getIReg. */
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ e = mkU32(guest_R15_curr_instr_notENC + 4);
+ } else {
+ e = IRExpr_Get( integerGuestRegOffset(iregNo), Ity_I32 );
+ }
+ return e;
+}
+
+/* Plain ("low level") write to a reg; no jump or alignment magic for
+ r15. */
+static void llPutIReg ( UInt iregNo, IRExpr* e )
+{
+ vassert(iregNo < 16);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
+ stmt( IRStmt_Put(integerGuestRegOffset(iregNo), e) );
+}
+
+/* Architected write to an integer register in ARM mode. If it is to
+ r15, record info so at the end of this insn's translation, a branch
+ to it can be made. Also handles conditional writes to the
+ register: if guardT == IRTemp_INVALID then the write is
+ unconditional. If writing r15, also 4-align it. */
+static void putIRegA ( UInt iregNo,
+ IRExpr* e,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */,
+ IRJumpKind jk /* if a jump is generated */ )
+{
+ /* if writing r15, force e to be 4-aligned. */
+ // INTERWORKING FIXME. this needs to be relaxed so that
+ // puts caused by LDMxx which load r15 interwork right.
+   // but is doing no alignment at all too relaxed?
+ //if (iregNo == 15)
+ // e = binop(Iop_And32, e, mkU32(~3));
+ ASSERT_IS_ARM;
+ /* So, generate either an unconditional or a conditional write to
+ the reg. */
+ if (guardT == IRTemp_INVALID) {
+ /* unconditional write */
+ llPutIReg( iregNo, e );
+ } else {
+ llPutIReg( iregNo,
+ IRExpr_Mux0X( unop(Iop_32to8, mkexpr(guardT)),
+ llGetIReg(iregNo),
+ e ));
+ }
+ if (iregNo == 15) {
+ // assert against competing r15 updates. Shouldn't
+ // happen; should be ruled out by the instr matching
+ // logic.
+ vassert(r15written == False);
+ vassert(r15guard == IRTemp_INVALID);
+ vassert(r15kind == Ijk_Boring);
+ r15written = True;
+ r15guard = guardT;
+ r15kind = jk;
+ }
+}
+
+
+/* Architected write to an integer register in Thumb mode. Writes to
+ r15 are not allowed. Handles conditional writes to the register:
+ if guardT == IRTemp_INVALID then the write is unconditional. */
+static void putIRegT ( UInt iregNo,
+ IRExpr* e,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
+{
+ /* So, generate either an unconditional or a conditional write to
+ the reg. */
+ ASSERT_IS_THUMB;
+ vassert(iregNo >= 0 && iregNo <= 14);
+ if (guardT == IRTemp_INVALID) {
+ /* unconditional write */
+ llPutIReg( iregNo, e );
+ } else {
+ llPutIReg( iregNo,
+ IRExpr_Mux0X( unop(Iop_32to8, mkexpr(guardT)),
+ llGetIReg(iregNo),
+ e ));
+ }
+}
+
+
+/* Thumb16 and Thumb32 only.
+ Returns true if reg is 13 or 15. Implements the BadReg
+ predicate in the ARM ARM. */
+static Bool isBadRegT ( UInt r )
+{
+ vassert(r <= 15);
+ ASSERT_IS_THUMB;
+ return r == 13 || r == 15;
+}
+
+
+/* ---------------- Double registers ---------------- */
+
+static Int doubleGuestRegOffset ( UInt dregNo )
+{
+ /* Do we care about endianness here? Probably do if we ever get
+ into the situation of dealing with the single-precision VFP
+ registers. */
+ switch (dregNo) {
+ case 0: return OFFB_D0;
+ case 1: return OFFB_D1;
+ case 2: return OFFB_D2;
+ case 3: return OFFB_D3;
+ case 4: return OFFB_D4;
+ case 5: return OFFB_D5;
+ case 6: return OFFB_D6;
+ case 7: return OFFB_D7;
+ case 8: return OFFB_D8;
+ case 9: return OFFB_D9;
+ case 10: return OFFB_D10;
+ case 11: return OFFB_D11;
+ case 12: return OFFB_D12;
+ case 13: return OFFB_D13;
+ case 14: return OFFB_D14;
+ case 15: return OFFB_D15;
+ case 16: return OFFB_D16;
+ case 17: return OFFB_D17;
+ case 18: return OFFB_D18;
+ case 19: return OFFB_D19;
+ case 20: return OFFB_D20;
+ case 21: return OFFB_D21;
+ case 22: return OFFB_D22;
+ case 23: return OFFB_D23;
+ case 24: return OFFB_D24;
+ case 25: return OFFB_D25;
+ case 26: return OFFB_D26;
+ case 27: return OFFB_D27;
+ case 28: return OFFB_D28;
+ case 29: return OFFB_D29;
+ case 30: return OFFB_D30;
+ case 31: return OFFB_D31;
+ default: vassert(0);
+ }
+}
+
+/* Plain ("low level") read from a VFP Dreg. */
+static IRExpr* llGetDReg ( UInt dregNo )
+{
+ vassert(dregNo < 32);
+ return IRExpr_Get( doubleGuestRegOffset(dregNo), Ity_F64 );
+}
+
+/* Architected read from a VFP Dreg. */
+static IRExpr* getDReg ( UInt dregNo ) {
+ return llGetDReg( dregNo );
+}
+
+/* Plain ("low level") write to a VFP Dreg. */
+static void llPutDReg ( UInt dregNo, IRExpr* e )
+{
+ vassert(dregNo < 32);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_F64);
+ stmt( IRStmt_Put(doubleGuestRegOffset(dregNo), e) );
+}
+
+/* Architected write to a VFP Dreg. Handles conditional writes to the
+ register: if guardT == IRTemp_INVALID then the write is
+ unconditional. */
+static void putDReg ( UInt dregNo,
+ IRExpr* e,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
+{
+ /* So, generate either an unconditional or a conditional write to
+ the reg. */
+ if (guardT == IRTemp_INVALID) {
+ /* unconditional write */
+ llPutDReg( dregNo, e );
+ } else {
+ llPutDReg( dregNo,
+ IRExpr_Mux0X( unop(Iop_32to8, mkexpr(guardT)),
+ llGetDReg(dregNo),
+ e ));
+ }
+}
+
+/* And now exactly the same stuff all over again, but this time
+ taking/returning I64 rather than F64, to support 64-bit Neon
+ ops. */
+
+/* Plain ("low level") read from a Neon Integer Dreg. */
+static IRExpr* llGetDRegI64 ( UInt dregNo )
+{
+ vassert(dregNo < 32);
+ return IRExpr_Get( doubleGuestRegOffset(dregNo), Ity_I64 );
+}
+
+/* Architected read from a Neon Integer Dreg. */
+static IRExpr* getDRegI64 ( UInt dregNo ) {
+ return llGetDRegI64( dregNo );
+}
+
+/* Plain ("low level") write to a Neon Integer Dreg. */
+static void llPutDRegI64 ( UInt dregNo, IRExpr* e )
+{
+ vassert(dregNo < 32);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
+ stmt( IRStmt_Put(doubleGuestRegOffset(dregNo), e) );
+}
+
+/* Architected write to a Neon Integer Dreg. Handles conditional
+ writes to the register: if guardT == IRTemp_INVALID then the write
+ is unconditional. */
+static void putDRegI64 ( UInt dregNo,
+ IRExpr* e,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
+{
+ /* So, generate either an unconditional or a conditional write to
+ the reg. */
+ if (guardT == IRTemp_INVALID) {
+ /* unconditional write */
+ llPutDRegI64( dregNo, e );
+ } else {
+ llPutDRegI64( dregNo,
+ IRExpr_Mux0X( unop(Iop_32to8, mkexpr(guardT)),
+ llGetDRegI64(dregNo),
+ e ));
+ }
+}
+
+/* ---------------- Quad registers ---------------- */
+
+static Int quadGuestRegOffset ( UInt qregNo )
+{
+ /* Do we care about endianness here? Probably do if we ever get
+      into the situation of dealing with the 128 bit Neon registers. */
+ switch (qregNo) {
+ case 0: return OFFB_D0;
+ case 1: return OFFB_D2;
+ case 2: return OFFB_D4;
+ case 3: return OFFB_D6;
+ case 4: return OFFB_D8;
+ case 5: return OFFB_D10;
+ case 6: return OFFB_D12;
+ case 7: return OFFB_D14;
+ case 8: return OFFB_D16;
+ case 9: return OFFB_D18;
+ case 10: return OFFB_D20;
+ case 11: return OFFB_D22;
+ case 12: return OFFB_D24;
+ case 13: return OFFB_D26;
+ case 14: return OFFB_D28;
+ case 15: return OFFB_D30;
+ default: vassert(0);
+ }
+}
+
+/* Plain ("low level") read from a Neon Qreg. */
+static IRExpr* llGetQReg ( UInt qregNo )
+{
+ vassert(qregNo < 16);
+ return IRExpr_Get( quadGuestRegOffset(qregNo), Ity_V128 );
+}
+
+/* Architected read from a Neon Qreg. */
+static IRExpr* getQReg ( UInt qregNo ) {
+ return llGetQReg( qregNo );
+}
+
+/* Plain ("low level") write to a Neon Qreg. */
+static void llPutQReg ( UInt qregNo, IRExpr* e )
+{
+ vassert(qregNo < 16);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
+ stmt( IRStmt_Put(quadGuestRegOffset(qregNo), e) );
+}
+
+/* Architected write to a Neon Qreg. Handles conditional writes to the
+ register: if guardT == IRTemp_INVALID then the write is
+ unconditional. */
+static void putQReg ( UInt qregNo,
+ IRExpr* e,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
+{
+ /* So, generate either an unconditional or a conditional write to
+ the reg. */
+ if (guardT == IRTemp_INVALID) {
+ /* unconditional write */
+ llPutQReg( qregNo, e );
+ } else {
+ llPutQReg( qregNo,
+ IRExpr_Mux0X( unop(Iop_32to8, mkexpr(guardT)),
+ llGetQReg(qregNo),
+ e ));
+ }
+}
+
+
+/* ---------------- Float registers ---------------- */
+
+static Int floatGuestRegOffset ( UInt fregNo )
+{
+ /* Start with the offset of the containing double, and then correct
+ for endianness. Actually this is completely bogus and needs
+ careful thought. */
+ Int off;
+ vassert(fregNo < 32);
+ off = doubleGuestRegOffset(fregNo >> 1);
+ if (host_is_bigendian) {
+ vassert(0);
+ } else {
+ if (fregNo & 1)
+ off += 4;
+ }
+ return off;
+}
+
+/* Plain ("low level") read from a VFP Freg. */
+static IRExpr* llGetFReg ( UInt fregNo )
+{
+ vassert(fregNo < 32);
+ return IRExpr_Get( floatGuestRegOffset(fregNo), Ity_F32 );
+}
+
+/* Architected read from a VFP Freg. */
+static IRExpr* getFReg ( UInt fregNo ) {
+ return llGetFReg( fregNo );
+}
+
+/* Plain ("low level") write to a VFP Freg. */
+static void llPutFReg ( UInt fregNo, IRExpr* e )
+{
+ vassert(fregNo < 32);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_F32);
+ stmt( IRStmt_Put(floatGuestRegOffset(fregNo), e) );
+}
+
+/* Architected write to a VFP Freg. Handles conditional writes to the
+ register: if guardT == IRTemp_INVALID then the write is
+ unconditional. */
+static void putFReg ( UInt fregNo,
+ IRExpr* e,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
+{
+ /* So, generate either an unconditional or a conditional write to
+ the reg. */
+ if (guardT == IRTemp_INVALID) {
+ /* unconditional write */
+ llPutFReg( fregNo, e );
+ } else {
+ llPutFReg( fregNo,
+ IRExpr_Mux0X( unop(Iop_32to8, mkexpr(guardT)),
+ llGetFReg(fregNo),
+ e ));
+ }
+}
+
+
+/* ---------------- Misc registers ---------------- */
+
+static void putMiscReg32 ( UInt gsoffset,
+ IRExpr* e, /* :: Ity_I32 */
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */)
+{
+ switch (gsoffset) {
+ case OFFB_FPSCR: break;
+ case OFFB_QFLAG32: break;
+ case OFFB_GEFLAG0: break;
+ case OFFB_GEFLAG1: break;
+ case OFFB_GEFLAG2: break;
+ case OFFB_GEFLAG3: break;
+ default: vassert(0); /* awaiting more cases */
+ }
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
+
+ if (guardT == IRTemp_INVALID) {
+ /* unconditional write */
+ stmt(IRStmt_Put(gsoffset, e));
+ } else {
+ stmt(IRStmt_Put(
+ gsoffset,
+ IRExpr_Mux0X( unop(Iop_32to8, mkexpr(guardT)),
+ IRExpr_Get(gsoffset, Ity_I32),
+ e
+ )
+ ));
+ }
+}
+
+static IRTemp get_ITSTATE ( void )
+{
+ ASSERT_IS_THUMB;
+ IRTemp t = newTemp(Ity_I32);
+ assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
+ return t;
+}
+
+static void put_ITSTATE ( IRTemp t )
+{
+ ASSERT_IS_THUMB;
+ stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
+}
+
+static IRTemp get_QFLAG32 ( void )
+{
+ IRTemp t = newTemp(Ity_I32);
+ assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
+ return t;
+}
+
+static void put_QFLAG32 ( IRTemp t, IRTemp condT )
+{
+ putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
+}
+
+/* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
+ Status Register) to indicate that overflow or saturation occurred.
+   Nb: e must be zero to denote no saturation, and any nonzero
+ value to indicate saturation. */
+static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
+{
+ IRTemp old = get_QFLAG32();
+ IRTemp nyu = newTemp(Ity_I32);
+ assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
+ put_QFLAG32(nyu, condT);
+}
+
+/* Generate code to set APSR.GE[flagNo]. Each fn call sets 1 bit.
+ flagNo: which flag bit to set [3...0]
+ lowbits_to_ignore: 0 = look at all 32 bits
+ 8 = look at top 24 bits only
+ 16 = look at top 16 bits only
+ 31 = look at the top bit only
+ e: input value to be evaluated.
+ The new value is taken from 'e' with the lowest 'lowbits_to_ignore'
+ masked out. If the resulting value is zero then the GE flag is
+ set to 0; any other value sets the flag to 1. */
+static void put_GEFLAG32 ( Int flagNo, /* 0, 1, 2 or 3 */
+ Int lowbits_to_ignore, /* 0, 8, 16 or 31 */
+ IRExpr* e, /* Ity_I32 */
+ IRTemp condT )
+{
+ vassert( flagNo >= 0 && flagNo <= 3 );
+ vassert( lowbits_to_ignore == 0 ||
+ lowbits_to_ignore == 8 ||
+ lowbits_to_ignore == 16 ||
+ lowbits_to_ignore == 31 );
+ IRTemp masked = newTemp(Ity_I32);
+ assign(masked, binop(Iop_Shr32, e, mkU8(lowbits_to_ignore)));
+
+ switch (flagNo) {
+ case 0: putMiscReg32(OFFB_GEFLAG0, mkexpr(masked), condT); break;
+ case 1: putMiscReg32(OFFB_GEFLAG1, mkexpr(masked), condT); break;
+ case 2: putMiscReg32(OFFB_GEFLAG2, mkexpr(masked), condT); break;
+ case 3: putMiscReg32(OFFB_GEFLAG3, mkexpr(masked), condT); break;
+ default: vassert(0);
+ }
+}
+
+/* Return the specified GE flag, in the 32-bit zero-or-nonzero
+   representation scheme. */
+static IRExpr* get_GEFLAG32( Int flagNo /* 0, 1, 2, 3 */ )
+{
+ switch (flagNo) {
+ case 0: return IRExpr_Get( OFFB_GEFLAG0, Ity_I32 );
+ case 1: return IRExpr_Get( OFFB_GEFLAG1, Ity_I32 );
+ case 2: return IRExpr_Get( OFFB_GEFLAG2, Ity_I32 );
+ case 3: return IRExpr_Get( OFFB_GEFLAG3, Ity_I32 );
+ default: vassert(0);
+ }
+}
+
+/* Set all 4 GE flags from the given 32-bit value as follows: GE 3 and
+ 2 are set from bit 31 of the value, and GE 1 and 0 are set from bit
+ 15 of the value. All other bits are ignored. */
+static void set_GE_32_10_from_bits_31_15 ( IRTemp t32, IRTemp condT )
+{
+ IRTemp ge10 = newTemp(Ity_I32);
+ IRTemp ge32 = newTemp(Ity_I32);
+ assign(ge10, binop(Iop_And32, mkexpr(t32), mkU32(0x00008000)));
+ assign(ge32, binop(Iop_And32, mkexpr(t32), mkU32(0x80000000)));
+ put_GEFLAG32( 0, 0, mkexpr(ge10), condT );
+ put_GEFLAG32( 1, 0, mkexpr(ge10), condT );
+ put_GEFLAG32( 2, 0, mkexpr(ge32), condT );
+ put_GEFLAG32( 3, 0, mkexpr(ge32), condT );
+}
+
+
+/* Set all 4 GE flags from the given 32-bit value as follows: GE 3
+   from bit 31, GE 2 from bit 23, GE 1 from bit 15, and GE 0 from
+ bit 7. All other bits are ignored. */
+static void set_GE_3_2_1_0_from_bits_31_23_15_7 ( IRTemp t32, IRTemp condT )
+{
+ IRTemp ge0 = newTemp(Ity_I32);
+ IRTemp ge1 = newTemp(Ity_I32);
+ IRTemp ge2 = newTemp(Ity_I32);
+ IRTemp ge3 = newTemp(Ity_I32);
+ assign(ge0, binop(Iop_And32, mkexpr(t32), mkU32(0x00000080)));
+ assign(ge1, binop(Iop_And32, mkexpr(t32), mkU32(0x00008000)));
+ assign(ge2, binop(Iop_And32, mkexpr(t32), mkU32(0x00800000)));
+ assign(ge3, binop(Iop_And32, mkexpr(t32), mkU32(0x80000000)));
+ put_GEFLAG32( 0, 0, mkexpr(ge0), condT );
+ put_GEFLAG32( 1, 0, mkexpr(ge1), condT );
+ put_GEFLAG32( 2, 0, mkexpr(ge2), condT );
+ put_GEFLAG32( 3, 0, mkexpr(ge3), condT );
+}
+
+
+/* ---------------- FPSCR stuff ---------------- */
+
+/* Generate IR to get hold of the rounding mode bits in FPSCR, and
+ convert them to IR format. Bind the final result to the
+ returned temp. */
+static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
+{
+ /* The ARMvfp encoding for rounding mode bits is:
+ 00 to nearest
+ 01 to +infinity
+ 10 to -infinity
+ 11 to zero
+ We need to convert that to the IR encoding:
+ 00 to nearest (the default)
+ 10 to +infinity
+ 01 to -infinity
+ 11 to zero
+ Which can be done by swapping bits 0 and 1.
+ The rmode bits are at 23:22 in FPSCR.
+ */
+ IRTemp armEncd = newTemp(Ity_I32);
+ IRTemp swapped = newTemp(Ity_I32);
+ /* Fish FPSCR[23:22] out, and slide to bottom. Doesn't matter that
+ we don't zero out bits 24 and above, since the assignment to
+ 'swapped' will mask them out anyway. */
+ assign(armEncd,
+ binop(Iop_Shr32, IRExpr_Get(OFFB_FPSCR, Ity_I32), mkU8(22)));
+ /* Now swap them. */
+ assign(swapped,
+ binop(Iop_Or32,
+ binop(Iop_And32,
+ binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)),
+ mkU32(2)),
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)),
+ mkU32(1))
+ ));
+ return swapped;
+}
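+
+/* Illustrative check of the bit swap above (not executed): for the ARM
+   encoding 01 (round to +infinity), ((1 << 1) & 2) | ((1 >> 1) & 1)
+   == 2, which is the IR encoding 10 for +infinity; 00 and 11 map to
+   themselves. */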
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for flag handling and conditional insns ---*/
+/*------------------------------------------------------------*/
+
+static HChar* name_ARMCondcode ( ARMCondcode cond )
+{
+ switch (cond) {
+ case ARMCondEQ: return "{eq}";
+ case ARMCondNE: return "{ne}";
+ case ARMCondHS: return "{hs}"; // or 'cs'
+ case ARMCondLO: return "{lo}"; // or 'cc'
+ case ARMCondMI: return "{mi}";
+ case ARMCondPL: return "{pl}";
+ case ARMCondVS: return "{vs}";
+ case ARMCondVC: return "{vc}";
+ case ARMCondHI: return "{hi}";
+ case ARMCondLS: return "{ls}";
+ case ARMCondGE: return "{ge}";
+ case ARMCondLT: return "{lt}";
+ case ARMCondGT: return "{gt}";
+ case ARMCondLE: return "{le}";
+      case ARMCondAL: return ""; // {al} is the default
+ case ARMCondNV: return "{nv}";
+ default: vpanic("name_ARMCondcode");
+ }
+}
+/* and a handy shorthand for it */
+static HChar* nCC ( ARMCondcode cond ) {
+ return name_ARMCondcode(cond);
+}
+
+
+/* Build IR to calculate some particular condition from stored
+ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
+ Ity_I32, suitable for narrowing. Although the return type is
+ Ity_I32, the returned value is either 0 or 1. 'cond' must be
+ :: Ity_I32 and must denote the condition to compute in
+ bits 7:4, and be zero everywhere else.
+*/
+static IRExpr* mk_armg_calculate_condition_dyn ( IRExpr* cond )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I32);
+   /* And 'cond' had better produce a value in which only bits 7:4
+      are nonzero.  However, obviously we can't assert for that. */
+
+   /* So what we're constructing for the first argument is
+      "(cond << 4) | stored-operation".  However, as per the comments
+      above, 'cond' must be supplied pre-shifted to this function.
+
+
+ This pairing scheme requires that the ARM_CC_OP_ values all fit
+ in 4 bits. Hence we are passing a (COND, OP) pair in the lowest
+ 8 bits of the first argument. */
+ IRExpr** args
+ = mkIRExprVec_4(
+ binop(Iop_Or32, IRExpr_Get(OFFB_CC_OP, Ity_I32), cond),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I32)
+ );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I32,
+ 0/*regparm*/,
+ "armg_calculate_condition", &armg_calculate_condition,
+ args
+ );
+
+ /* Exclude the requested condition, OP and NDEP from definedness
+ checking. We're only interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
+ return call;
+}
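+
+/* A minimal usage sketch (illustrative, not part of the decoder): to
+   evaluate, say, condition NE against the current thunk, a caller
+   passes the condition pre-shifted, e.g.
+
+      IRExpr* isNE = mk_armg_calculate_condition_dyn(
+                        mkU32(ARMCondNE << 4) );
+
+   so the helper sees (ARMCondNE << 4) | CC_OP in its first argument.
+   The fixed-condition wrapper below does exactly this. */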
+
+
+/* Build IR to calculate some particular condition from stored
+ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression of type
+ Ity_I32, suitable for narrowing. Although the return type is
+ Ity_I32, the returned value is either 0 or 1.
+*/
+static IRExpr* mk_armg_calculate_condition ( ARMCondcode cond )
+{
+   /* First arg is "(cond << 4) | cc_op".  This requires that the
+      ARM_CC_OP_ values all fit in 4 bits.  Hence we are passing a
+      (COND, OP) pair in the lowest 8 bits of the first argument. */
+ vassert(cond >= 0 && cond <= 15);
+ return mk_armg_calculate_condition_dyn( mkU32(cond << 4) );
+}
+
+
+/* Build IR to calculate just the carry flag from stored
+ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
+ Ity_I32. */
+static IRExpr* mk_armg_calculate_flag_c ( void )
+{
+ IRExpr** args
+ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I32,
+ 0/*regparm*/,
+ "armg_calculate_flag_c", &armg_calculate_flag_c,
+ args
+ );
+ /* Exclude OP and NDEP from definedness checking. We're only
+ interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
+ return call;
+}
+
+
+/* Build IR to calculate just the overflow flag from stored
+ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
+ Ity_I32. */
+static IRExpr* mk_armg_calculate_flag_v ( void )
+{
+ IRExpr** args
+ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I32,
+ 0/*regparm*/,
+ "armg_calculate_flag_v", &armg_calculate_flag_v,
+ args
+ );
+ /* Exclude OP and NDEP from definedness checking. We're only
+ interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
+ return call;
+}
+
+
+/* Build IR to calculate N Z C V in bits 31:28 of the
+ returned word. */
+static IRExpr* mk_armg_calculate_flags_nzcv ( void )
+{
+ IRExpr** args
+ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I32,
+ 0/*regparm*/,
+ "armg_calculate_flags_nzcv", &armg_calculate_flags_nzcv,
+ args
+ );
+ /* Exclude OP and NDEP from definedness checking. We're only
+ interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
+ return call;
+}
+
+static IRExpr* mk_armg_calculate_flag_qc ( IRExpr* resL, IRExpr* resR, Bool Q )
+{
+ IRExpr** args1;
+ IRExpr** args2;
+ IRExpr *call1, *call2, *res;
+
+ if (Q) {
+ args1 = mkIRExprVec_4 ( binop(Iop_GetElem32x4, resL, mkU8(0)),
+ binop(Iop_GetElem32x4, resL, mkU8(1)),
+ binop(Iop_GetElem32x4, resR, mkU8(0)),
+ binop(Iop_GetElem32x4, resR, mkU8(1)) );
+ args2 = mkIRExprVec_4 ( binop(Iop_GetElem32x4, resL, mkU8(2)),
+ binop(Iop_GetElem32x4, resL, mkU8(3)),
+ binop(Iop_GetElem32x4, resR, mkU8(2)),
+ binop(Iop_GetElem32x4, resR, mkU8(3)) );
+ } else {
+ args1 = mkIRExprVec_4 ( binop(Iop_GetElem32x2, resL, mkU8(0)),
+ binop(Iop_GetElem32x2, resL, mkU8(1)),
+ binop(Iop_GetElem32x2, resR, mkU8(0)),
+ binop(Iop_GetElem32x2, resR, mkU8(1)) );
+ }
+
+#if 1
+ call1 = mkIRExprCCall(
+ Ity_I32,
+ 0/*regparm*/,
+ "armg_calculate_flag_qc", &armg_calculate_flag_qc,
+ args1
+ );
+ if (Q) {
+ call2 = mkIRExprCCall(
+ Ity_I32,
+ 0/*regparm*/,
+ "armg_calculate_flag_qc", &armg_calculate_flag_qc,
+ args2
+ );
+ }
+ if (Q) {
+ res = binop(Iop_Or32, call1, call2);
+ } else {
+ res = call1;
+ }
+#else
+ if (Q) {
+ res = unop(Iop_1Uto32,
+ binop(Iop_CmpNE32,
+ binop(Iop_Or32,
+ binop(Iop_Or32,
+ binop(Iop_Xor32,
+ args1[0],
+ args1[2]),
+ binop(Iop_Xor32,
+ args1[1],
+ args1[3])),
+ binop(Iop_Or32,
+ binop(Iop_Xor32,
+ args2[0],
+ args2[2]),
+ binop(Iop_Xor32,
+ args2[1],
+ args2[3]))),
+ mkU32(0)));
+ } else {
+ res = unop(Iop_1Uto32,
+ binop(Iop_CmpNE32,
+ binop(Iop_Or32,
+ binop(Iop_Xor32,
+ args1[0],
+ args1[2]),
+ binop(Iop_Xor32,
+ args1[1],
+ args1[3])),
+ mkU32(0)));
+ }
+#endif
+ return res;
+}
+
+// FIXME: this is named wrongly .. it's a sticky OR into the QC
+// flag, not a plain write to it.
+static void setFlag_QC ( IRExpr* resL, IRExpr* resR, Bool Q,
+ IRTemp condT )
+{
+ putMiscReg32 (OFFB_FPSCR,
+ binop(Iop_Or32,
+ IRExpr_Get(OFFB_FPSCR, Ity_I32),
+ binop(Iop_Shl32,
+ mk_armg_calculate_flag_qc(resL, resR, Q),
+ mkU8(27))),
+ condT);
+}
+
+/* Build IR to conditionally set the flags thunk. As with putIReg, if
+ guard is IRTemp_INVALID then it's unconditional, else it holds a
+ condition :: Ity_I32. */
+static
+void setFlags_D1_D2_ND ( UInt cc_op, IRTemp t_dep1,
+ IRTemp t_dep2, IRTemp t_ndep,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
+{
+ IRTemp c8;
+   vassert(typeOfIRTemp(irsb->tyenv, t_dep1) == Ity_I32);
+   vassert(typeOfIRTemp(irsb->tyenv, t_dep2) == Ity_I32);
+   vassert(typeOfIRTemp(irsb->tyenv, t_ndep) == Ity_I32);
+ vassert(cc_op >= ARMG_CC_OP_COPY && cc_op < ARMG_CC_OP_NUMBER);
+ if (guardT == IRTemp_INVALID) {
+ /* unconditional */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(cc_op) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) ));
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) ));
+ } else {
+ /* conditional */
+ c8 = newTemp(Ity_I8);
+ assign( c8, unop(Iop_32to8, mkexpr(guardT)) );
+ stmt( IRStmt_Put(
+ OFFB_CC_OP,
+ IRExpr_Mux0X( mkexpr(c8),
+ IRExpr_Get(OFFB_CC_OP, Ity_I32),
+ mkU32(cc_op) )));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ IRExpr_Mux0X( mkexpr(c8),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
+ mkexpr(t_dep1) )));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP2,
+ IRExpr_Mux0X( mkexpr(c8),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
+ mkexpr(t_dep2) )));
+ stmt( IRStmt_Put(
+ OFFB_CC_NDEP,
+ IRExpr_Mux0X( mkexpr(c8),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I32),
+ mkexpr(t_ndep) )));
+ }
+}
+
+
+/* Minor variant of the above that sets NDEP to zero (if it
+ sets it at all) */
+static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
+ IRTemp t_dep2,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
+{
+ IRTemp z32 = newTemp(Ity_I32);
+ assign( z32, mkU32(0) );
+ setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
+}
+
+
+/* Minor variant of the above that sets DEP2 to zero (if it
+ sets it at all) */
+static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
+ IRTemp t_ndep,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
+{
+ IRTemp z32 = newTemp(Ity_I32);
+ assign( z32, mkU32(0) );
+ setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
+}
+
+
+/* Minor variant of the above that sets DEP2 and NDEP to zero (if it
+ sets them at all) */
+static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
+{
+ IRTemp z32 = newTemp(Ity_I32);
+ assign( z32, mkU32(0) );
+ setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
+}
+
+
+/* ARM only */
+/* Generate a side-exit to the next instruction, if the given guard
+ expression :: Ity_I32 is 0 (note! the side exit is taken if the
+   condition is false!).  This is used to skip over conditional
+ instructions which we can't generate straight-line code for, either
+ because they are too complex or (more likely) they potentially
+ generate exceptions.
+*/
+static void mk_skip_over_A32_if_cond_is_false (
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */
+ )
+{
+ ASSERT_IS_ARM;
+ vassert(guardT != IRTemp_INVALID);
+ vassert(0 == (guest_R15_curr_instr_notENC & 3));
+ stmt( IRStmt_Exit(
+ unop(Iop_Not1, unop(Iop_32to1, mkexpr(guardT))),
+ Ijk_Boring,
+ IRConst_U32(toUInt(guest_R15_curr_instr_notENC + 4))
+ ));
+}
+
+/* Thumb16 only */
+/* ditto, but jump over a 16-bit thumb insn */
+static void mk_skip_over_T16_if_cond_is_false (
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */
+ )
+{
+ ASSERT_IS_THUMB;
+ vassert(guardT != IRTemp_INVALID);
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ stmt( IRStmt_Exit(
+ unop(Iop_Not1, unop(Iop_32to1, mkexpr(guardT))),
+ Ijk_Boring,
+ IRConst_U32(toUInt((guest_R15_curr_instr_notENC + 2) | 1))
+ ));
+}
+
+
+/* Thumb32 only */
+/* ditto, but jump over a 32-bit thumb insn */
+static void mk_skip_over_T32_if_cond_is_false (
+ IRTemp guardT /* :: Ity_I32, 0 or 1 */
+ )
+{
+ ASSERT_IS_THUMB;
+ vassert(guardT != IRTemp_INVALID);
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ stmt( IRStmt_Exit(
+ unop(Iop_Not1, unop(Iop_32to1, mkexpr(guardT))),
+ Ijk_Boring,
+ IRConst_U32(toUInt((guest_R15_curr_instr_notENC + 4) | 1))
+ ));
+}
+
+
+/* Thumb16 and Thumb32 only
+ Generate a SIGILL followed by a restart of the current instruction
+ if the given temp is nonzero. */
+static void gen_SIGILL_T_if_nonzero ( IRTemp t /* :: Ity_I32 */ )
+{
+ ASSERT_IS_THUMB;
+ vassert(t != IRTemp_INVALID);
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32, mkexpr(t), mkU32(0)),
+ Ijk_NoDecode,
+ IRConst_U32(toUInt(guest_R15_curr_instr_notENC | 1))
+ )
+ );
+}
+
+
+/* Inspect the old_itstate, and generate a SIGILL if it indicates that
+ we are currently in an IT block and are not the last in the block.
+ This also rolls back guest_ITSTATE to its old value before the exit
+ and restores it to its new value afterwards. This is so that if
+ the exit is taken, we have an up to date version of ITSTATE
+ available. Without doing that, we have no hope of making precise
+ exceptions work. */
+static void gen_SIGILL_T_if_in_but_NLI_ITBlock (
+ IRTemp old_itstate /* :: Ity_I32 */,
+ IRTemp new_itstate /* :: Ity_I32 */
+ )
+{
+ ASSERT_IS_THUMB;
+ put_ITSTATE(old_itstate); // backout
+ IRTemp guards_for_next3 = newTemp(Ity_I32);
+ assign(guards_for_next3,
+ binop(Iop_Shr32, mkexpr(old_itstate), mkU8(8)));
+ gen_SIGILL_T_if_nonzero(guards_for_next3);
+ put_ITSTATE(new_itstate); //restore
+}
+
+
+/* Simpler version of the above, which generates a SIGILL if
+ we're anywhere within an IT block. */
+static void gen_SIGILL_T_if_in_ITBlock (
+ IRTemp old_itstate /* :: Ity_I32 */,
+ IRTemp new_itstate /* :: Ity_I32 */
+ )
+{
+ put_ITSTATE(old_itstate); // backout
+ gen_SIGILL_T_if_nonzero(old_itstate);
+ put_ITSTATE(new_itstate); //restore
+}
+
+
+/* Generate an APSR value, from the NZCV thunk, and
+ from QFLAG32 and GEFLAG0 .. GEFLAG3. */
+static IRTemp synthesise_APSR ( void )
+{
+ IRTemp res1 = newTemp(Ity_I32);
+ // Get NZCV
+ assign( res1, mk_armg_calculate_flags_nzcv() );
+ // OR in the Q value
+ IRTemp res2 = newTemp(Ity_I32);
+ assign(
+ res2,
+ binop(Iop_Or32,
+ mkexpr(res1),
+ binop(Iop_Shl32,
+ unop(Iop_1Uto32,
+ binop(Iop_CmpNE32,
+ mkexpr(get_QFLAG32()),
+ mkU32(0))),
+ mkU8(ARMG_CC_SHIFT_Q)))
+ );
+ // OR in GE0 .. GE3
+ IRExpr* ge0
+ = unop(Iop_1Uto32, binop(Iop_CmpNE32, get_GEFLAG32(0), mkU32(0)));
+ IRExpr* ge1
+ = unop(Iop_1Uto32, binop(Iop_CmpNE32, get_GEFLAG32(1), mkU32(0)));
+ IRExpr* ge2
+ = unop(Iop_1Uto32, binop(Iop_CmpNE32, get_GEFLAG32(2), mkU32(0)));
+ IRExpr* ge3
+ = unop(Iop_1Uto32, binop(Iop_CmpNE32, get_GEFLAG32(3), mkU32(0)));
+ IRTemp res3 = newTemp(Ity_I32);
+ assign(res3,
+ binop(Iop_Or32,
+ mkexpr(res2),
+ binop(Iop_Or32,
+ binop(Iop_Or32,
+ binop(Iop_Shl32, ge0, mkU8(16)),
+ binop(Iop_Shl32, ge1, mkU8(17))),
+ binop(Iop_Or32,
+ binop(Iop_Shl32, ge2, mkU8(18)),
+ binop(Iop_Shl32, ge3, mkU8(19))) )));
+ return res3;
+}
+
+
+/* and the inverse transformation: given an APSR value,
+ set the NZCV thunk, the Q flag, and the GE flags. */
+static void desynthesise_APSR ( Bool write_nzcvq, Bool write_ge,
+ IRTemp apsrT, IRTemp condT )
+{
+ vassert(write_nzcvq || write_ge);
+ if (write_nzcvq) {
+ // Do NZCV
+ IRTemp immT = newTemp(Ity_I32);
+ assign(immT, binop(Iop_And32, mkexpr(apsrT), mkU32(0xF0000000)) );
+ setFlags_D1(ARMG_CC_OP_COPY, immT, condT);
+ // Do Q
+ IRTemp qnewT = newTemp(Ity_I32);
+ assign(qnewT, binop(Iop_And32, mkexpr(apsrT), mkU32(ARMG_CC_MASK_Q)));
+ put_QFLAG32(qnewT, condT);
+ }
+ if (write_ge) {
+ // Do GE3..0
+ put_GEFLAG32(0, 0, binop(Iop_And32, mkexpr(apsrT), mkU32(1<<16)),
+ condT);
+ put_GEFLAG32(1, 0, binop(Iop_And32, mkexpr(apsrT), mkU32(1<<17)),
+ condT);
+ put_GEFLAG32(2, 0, binop(Iop_And32, mkexpr(apsrT), mkU32(1<<18)),
+ condT);
+ put_GEFLAG32(3, 0, binop(Iop_And32, mkexpr(apsrT), mkU32(1<<19)),
+ condT);
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for saturation ---*/
+/*------------------------------------------------------------*/
+
+/* FIXME: absolutely the only difference between (a) armUnsignedSatQ
+   and (b) armSignedSatQ is that in (a) the floor is set to 0, whereas
+   in (b) the floor is computed from the value of imm5.  These two fns
+   should be commoned up. */
+
+/* UnsignedSatQ(): 'clamp' each value so that 0 <= x <= (2^N)-1.
+ Optionally return flag resQ saying whether saturation occurred.
+ See definition in manual, section A2.2.1, page 41
+ (bits(N), boolean) UnsignedSatQ( integer i, integer N )
+ {
+ if ( i > (2^N)-1 ) { result = (2^N)-1; saturated = TRUE; }
+ elsif ( i < 0 ) { result = 0; saturated = TRUE; }
+ else { result = i; saturated = FALSE; }
+ return ( result<N-1:0>, saturated );
+ }
+*/
+static void armUnsignedSatQ( IRTemp* res, /* OUT - Ity_I32 */
+ IRTemp* resQ, /* OUT - Ity_I32 */
+ IRTemp regT, /* value to clamp - Ity_I32 */
+ UInt imm5 ) /* saturation ceiling */
+{
+ UInt ceil = (1 << imm5) - 1; // (2^imm5)-1
+ UInt floor = 0;
+
+ IRTemp node0 = newTemp(Ity_I32);
+ IRTemp node1 = newTemp(Ity_I32);
+ IRTemp node2 = newTemp(Ity_I1);
+ IRTemp node3 = newTemp(Ity_I32);
+ IRTemp node4 = newTemp(Ity_I32);
+ IRTemp node5 = newTemp(Ity_I1);
+ IRTemp node6 = newTemp(Ity_I32);
+
+ assign( node0, mkexpr(regT) );
+ assign( node1, mkU32(ceil) );
+ assign( node2, binop( Iop_CmpLT32S, mkexpr(node1), mkexpr(node0) ) );
+ assign( node3, IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(node2)),
+ mkexpr(node0),
+ mkexpr(node1) ) );
+ assign( node4, mkU32(floor) );
+ assign( node5, binop( Iop_CmpLT32S, mkexpr(node3), mkexpr(node4) ) );
+ assign( node6, IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(node5)),
+ mkexpr(node3),
+ mkexpr(node4) ) );
+ assign( *res, mkexpr(node6) );
+
+   /* If saturation occurred, then resQ is set to some nonzero value;
+      if sat did not occur, resQ is guaranteed to be zero. */
+ if (resQ) {
+ assign( *resQ, binop(Iop_Xor32, mkexpr(*res), mkexpr(regT)) );
+ }
+}
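+
+/* Worked example for the above (illustrative only): with imm5 == 8,
+   ceil is 255.  An input of 0x1FF clamps to 255 and *resQ becomes
+   0x1FF ^ 0xFF == 0x100 (nonzero, hence saturated); an input of 7
+   passes through and *resQ is 7 ^ 7 == 0. */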
+
+
+/* SignedSatQ(): 'clamp' each value so that -2^(N-1) <= x <= 2^(N-1)-1.
+   Optionally return flag resQ saying whether saturation occurred.
+   - see definition in manual, section A2.2.1, page 41
+   (bits(N), boolean ) SignedSatQ( integer i, integer N )
+   {
+     if ( i > 2^(N-1) - 1 )    { result = 2^(N-1) - 1; saturated = TRUE; }
+     elsif ( i < -(2^(N-1)) )  { result = -(2^(N-1));  saturated = TRUE; }
+     else                      { result = i;           saturated = FALSE; }
+     return ( result<N-1:0>, saturated );
+   }
+*/
+static void armSignedSatQ( IRTemp regT, /* value to clamp - Ity_I32 */
+ UInt imm5, /* saturation ceiling */
+ IRTemp* res, /* OUT - Ity_I32 */
+ IRTemp* resQ ) /* OUT - Ity_I32 */
+{
+ Int ceil = (1 << (imm5-1)) - 1; // (2^(imm5-1))-1
+ Int floor = -(1 << (imm5-1)); // -(2^(imm5-1))
+
+ IRTemp node0 = newTemp(Ity_I32);
+ IRTemp node1 = newTemp(Ity_I32);
+ IRTemp node2 = newTemp(Ity_I1);
+ IRTemp node3 = newTemp(Ity_I32);
+ IRTemp node4 = newTemp(Ity_I32);
+ IRTemp node5 = newTemp(Ity_I1);
+ IRTemp node6 = newTemp(Ity_I32);
+
+ assign( node0, mkexpr(regT) );
+ assign( node1, mkU32(ceil) );
+ assign( node2, binop( Iop_CmpLT32S, mkexpr(node1), mkexpr(node0) ) );
+ assign( node3, IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(node2)),
+ mkexpr(node0), mkexpr(node1) ) );
+ assign( node4, mkU32(floor) );
+ assign( node5, binop( Iop_CmpLT32S, mkexpr(node3), mkexpr(node4) ) );
+ assign( node6, IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(node5)),
+ mkexpr(node3), mkexpr(node4) ) );
+ assign( *res, mkexpr(node6) );
+
+   /* If saturation occurred, then resQ is set to some nonzero value;
+      if sat did not occur, resQ is guaranteed to be zero. */
+ if (resQ) {
+ assign( *resQ, binop(Iop_Xor32, mkexpr(*res), mkexpr(regT)) );
+ }
+}
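+
+/* Worked example for the above (illustrative only): with imm5 == 8,
+   ceil is 127 and floor is -128.  An input of 200 clamps to 127 and
+   *resQ == 200 ^ 127 is nonzero (saturated); an input of -5 is
+   unchanged and *resQ is zero. */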
+
+
+/* Compute a value 0 :: I32 or 1 :: I32, indicating whether signed
+ overflow occurred for 32-bit addition. Needs both args and the
+ result. HD p27. */
+static
+IRExpr* signed_overflow_after_Add32 ( IRExpr* resE,
+ IRTemp argL, IRTemp argR )
+{
+ IRTemp res = newTemp(Ity_I32);
+ assign(res, resE);
+ return
+ binop( Iop_Shr32,
+ binop( Iop_And32,
+ binop( Iop_Xor32, mkexpr(res), mkexpr(argL) ),
+ binop( Iop_Xor32, mkexpr(res), mkexpr(argR) )),
+ mkU8(31) );
+}
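+
+/* Illustrative check of the formula (not executed): overflow occurs
+   iff the result's sign differs from the signs of both operands.
+   E.g. 0x40000000 + 0x40000000 == 0x80000000 gives
+   (res ^ argL) & (res ^ argR) == 0xC0000000, whose bit 31 is 1;
+   for 1 + 1 == 2 the same expression is 3, whose bit 31 is 0. */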
+
+
+/*------------------------------------------------------------*/
+/*--- Larger helpers ---*/
+/*------------------------------------------------------------*/
+
+/* Compute both the result and new C flag value for an LSL by an imm5
+   or by a register operand.  May generate reads of the old C value
+   (hence only safe to use before any writes to guest state happen).
+   These are factored out so they can be used by both ARM and Thumb.
+
+   Note that in compute_result_and_C_after_{LSL,LSR,ASR}_by{imm5,reg},
+   "res" (the result) is a.k.a. "shop", the shifter operand, and
+   "newC" (the new C) is a.k.a. "shco", the shifter carry out.
+
+   The calling convention for res and newC is a bit funny.  They could
+   be passed by value, but instead are passed by ref.
+*/
+
+static void compute_result_and_C_after_LSL_by_imm5 (
+ /*OUT*/HChar* buf,
+ IRTemp* res,
+ IRTemp* newC,
+ IRTemp rMt, UInt shift_amt, /* operands */
+ UInt rM /* only for debug printing */
+ )
+{
+ if (shift_amt == 0) {
+ if (newC) {
+ assign( *newC, mk_armg_calculate_flag_c() );
+ }
+ assign( *res, mkexpr(rMt) );
+ DIS(buf, "r%u", rM);
+ } else {
+ vassert(shift_amt >= 1 && shift_amt <= 31);
+ if (newC) {
+ assign( *newC,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(rMt),
+ mkU8(32 - shift_amt)),
+ mkU32(1)));
+ }
+ assign( *res,
+ binop(Iop_Shl32, mkexpr(rMt), mkU8(shift_amt)) );
+ DIS(buf, "r%u, LSL #%u", rM, shift_amt);
+ }
+}
+
+
+static void compute_result_and_C_after_LSL_by_reg (
+ /*OUT*/HChar* buf,
+ IRTemp* res,
+ IRTemp* newC,
+ IRTemp rMt, IRTemp rSt, /* operands */
+ UInt rM, UInt rS /* only for debug printing */
+ )
+{
+ // shift left in range 0 .. 255
+ // amt = rS & 255
+ // res = amt < 32 ? Rm << amt : 0
+ // newC = amt == 0 ? oldC :
+ // amt in 1..32 ? Rm[32-amt] : 0
+ IRTemp amtT = newTemp(Ity_I32);
+ assign( amtT, binop(Iop_And32, mkexpr(rSt), mkU32(255)) );
+ if (newC) {
+      /* mux0X(amt == 0,
+            mux0X(amt <= 32,
+                  0,
+                  Rm[(32-amt) & 31]),
+            oldC)
+      */
+ /* About the best you can do is pray that iropt is able
+ to nuke most or all of the following junk. */
+ IRTemp oldC = newTemp(Ity_I32);
+ assign(oldC, mk_armg_calculate_flag_c() );
+ assign(
+ *newC,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpEQ32, mkexpr(amtT), mkU32(0))),
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
+ mkU32(0),
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Sub32,
+ mkU32(32),
+ mkexpr(amtT)),
+ mkU32(31)
+ )
+ )
+ )
+ ),
+ mkexpr(oldC)
+ )
+ );
+ }
+ // (Rm << (Rs & 31)) & (((Rs & 255) - 32) >>s 31)
+ // Lhs of the & limits the shift to 31 bits, so as to
+ // give known IR semantics. Rhs of the & is all 1s for
+ // Rs <= 31 and all 0s for Rs >= 32.
+ assign(
+ *res,
+ binop(
+ Iop_And32,
+ binop(Iop_Shl32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_And32, mkexpr(rSt), mkU32(31)))),
+ binop(Iop_Sar32,
+ binop(Iop_Sub32,
+ mkexpr(amtT),
+ mkU32(32)),
+ mkU8(31))));
+ DIS(buf, "r%u, LSL r%u", rM, rS);
+}
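+
+/* Illustrative check of the masking trick above (not executed): for
+   Rs == 5, ((5 & 255) - 32) >>s 31 is all ones, so the result is
+   Rm << 5; for Rs == 33, Rm is shifted by 33 & 31 == 1, but
+   (33 - 32) >>s 31 is zero, so the result collapses to 0, as the
+   architecture requires for shift amounts of 32 or more. */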
+
+
+static void compute_result_and_C_after_LSR_by_imm5 (
+ /*OUT*/HChar* buf,
+ IRTemp* res,
+ IRTemp* newC,
+ IRTemp rMt, UInt shift_amt, /* operands */
+ UInt rM /* only for debug printing */
+ )
+{
+ if (shift_amt == 0) {
+ // conceptually a 32-bit shift, however:
+ // res = 0
+ // newC = Rm[31]
+ if (newC) {
+ assign( *newC,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(rMt), mkU8(31)),
+ mkU32(1)));
+ }
+ assign( *res, mkU32(0) );
+ DIS(buf, "r%u, LSR #0(a.k.a. 32)", rM);
+ } else {
+ // shift in range 1..31
+ // res = Rm >>u shift_amt
+ // newC = Rm[shift_amt - 1]
+ vassert(shift_amt >= 1 && shift_amt <= 31);
+ if (newC) {
+ assign( *newC,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(rMt),
+ mkU8(shift_amt - 1)),
+ mkU32(1)));
+ }
+ assign( *res,
+ binop(Iop_Shr32, mkexpr(rMt), mkU8(shift_amt)) );
+ DIS(buf, "r%u, LSR #%u", rM, shift_amt);
+ }
+}
+
+
+static void compute_result_and_C_after_LSR_by_reg (
+ /*OUT*/HChar* buf,
+ IRTemp* res,
+ IRTemp* newC,
+ IRTemp rMt, IRTemp rSt, /* operands */
+ UInt rM, UInt rS /* only for debug printing */
+ )
+{
+ // shift right in range 0 .. 255
+ // amt = rS & 255
+ // res = amt < 32 ? Rm >>u amt : 0
+ // newC = amt == 0 ? oldC :
+ // amt in 1..32 ? Rm[amt-1] : 0
+ IRTemp amtT = newTemp(Ity_I32);
+ assign( amtT, binop(Iop_And32, mkexpr(rSt), mkU32(255)) );
+ if (newC) {
+      /* mux0X(amt == 0,
+            mux0X(amt <= 32,
+                  0,
+                  Rm[(amt-1) & 31]),
+            oldC)
+      */
+ IRTemp oldC = newTemp(Ity_I32);
+ assign(oldC, mk_armg_calculate_flag_c() );
+ assign(
+ *newC,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpEQ32, mkexpr(amtT), mkU32(0))),
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
+ mkU32(0),
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Sub32,
+ mkexpr(amtT),
+ mkU32(1)),
+ mkU32(31)
+ )
+ )
+ )
+ ),
+ mkexpr(oldC)
+ )
+ );
+ }
+ // (Rm >>u (Rs & 31)) & (((Rs & 255) - 32) >>s 31)
+ // Lhs of the & limits the shift to 31 bits, so as to
+ // give known IR semantics. Rhs of the & is all 1s for
+ // Rs <= 31 and all 0s for Rs >= 32.
+ assign(
+ *res,
+ binop(
+ Iop_And32,
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_And32, mkexpr(rSt), mkU32(31)))),
+ binop(Iop_Sar32,
+ binop(Iop_Sub32,
+ mkexpr(amtT),
+ mkU32(32)),
+ mkU8(31))));
+ DIS(buf, "r%u, LSR r%u", rM, rS);
+}
+
+
+static void compute_result_and_C_after_ASR_by_imm5 (
+ /*OUT*/HChar* buf,
+ IRTemp* res,
+ IRTemp* newC,
+ IRTemp rMt, UInt shift_amt, /* operands */
+ UInt rM /* only for debug printing */
+ )
+{
+ if (shift_amt == 0) {
+ // conceptually a 32-bit shift, however:
+ // res = Rm >>s 31
+ // newC = Rm[31]
+ if (newC) {
+ assign( *newC,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(rMt), mkU8(31)),
+ mkU32(1)));
+ }
+ assign( *res, binop(Iop_Sar32, mkexpr(rMt), mkU8(31)) );
+ DIS(buf, "r%u, ASR #0(a.k.a. 32)", rM);
+ } else {
+ // shift in range 1..31
+ // res = Rm >>s shift_amt
+ // newC = Rm[shift_amt - 1]
+ vassert(shift_amt >= 1 && shift_amt <= 31);
+ if (newC) {
+ assign( *newC,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(rMt),
+ mkU8(shift_amt - 1)),
+ mkU32(1)));
+ }
+ assign( *res,
+ binop(Iop_Sar32, mkexpr(rMt), mkU8(shift_amt)) );
+ DIS(buf, "r%u, ASR #%u", rM, shift_amt);
+ }
+}
+
+
+static void compute_result_and_C_after_ASR_by_reg (
+ /*OUT*/HChar* buf,
+ IRTemp* res,
+ IRTemp* newC,
+ IRTemp rMt, IRTemp rSt, /* operands */
+ UInt rM, UInt rS /* only for debug printing */
+ )
+{
+ // arithmetic shift right in range 0 .. 255
+ // amt = rS & 255
+ // res = amt < 32 ? Rm >>s amt : Rm >>s 31
+ // newC = amt == 0 ? oldC :
+ // amt in 1..32 ? Rm[amt-1] : Rm[31]
+ IRTemp amtT = newTemp(Ity_I32);
+ assign( amtT, binop(Iop_And32, mkexpr(rSt), mkU32(255)) );
+ if (newC) {
+      /* mux0X(amt == 0,
+            mux0X(amt <= 32,
+                  Rm[31],
+                  Rm[(amt-1) & 31]),
+            oldC)
+      */
+ IRTemp oldC = newTemp(Ity_I32);
+ assign(oldC, mk_armg_calculate_flag_c() );
+ assign(
+ *newC,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpEQ32, mkexpr(amtT), mkU32(0))),
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpLE32U, mkexpr(amtT), mkU32(32))),
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ mkU8(31)
+ ),
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Sub32,
+ mkexpr(amtT),
+ mkU32(1)),
+ mkU32(31)
+ )
+ )
+ )
+ ),
+ mkexpr(oldC)
+ )
+ );
+ }
+ // (Rm >>s (amt <u 32 ? amt : 31))
+ assign(
+ *res,
+ binop(
+ Iop_Sar32,
+ mkexpr(rMt),
+ unop(
+ Iop_32to8,
+ IRExpr_Mux0X(
+ unop(
+ Iop_1Uto8,
+ binop(Iop_CmpLT32U, mkexpr(amtT), mkU32(32))),
+ mkU32(31),
+ mkexpr(amtT)))));
+ DIS(buf, "r%u, ASR r%u", rM, rS);
+}
+
+
+static void compute_result_and_C_after_ROR_by_reg (
+ /*OUT*/HChar* buf,
+ IRTemp* res,
+ IRTemp* newC,
+ IRTemp rMt, IRTemp rSt, /* operands */
+ UInt rM, UInt rS /* only for debug printing */
+ )
+{
+ // rotate right in range 0 .. 255
+ // amt = rS & 255
+ // shop = Rm `ror` (amt & 31)
+ // shco = amt == 0 ? oldC : Rm[(amt-1) & 31]
+ IRTemp amtT = newTemp(Ity_I32);
+ assign( amtT, binop(Iop_And32, mkexpr(rSt), mkU32(255)) );
+ IRTemp amt5T = newTemp(Ity_I32);
+ assign( amt5T, binop(Iop_And32, mkexpr(rSt), mkU32(31)) );
+ IRTemp oldC = newTemp(Ity_I32);
+ assign(oldC, mk_armg_calculate_flag_c() );
+ if (newC) {
+ assign(
+ *newC,
+ IRExpr_Mux0X(
+ unop(Iop_32to8, mkexpr(amtT)),
+ mkexpr(oldC),
+ binop(Iop_And32,
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Sub32,
+ mkexpr(amtT),
+ mkU32(1)
+ ),
+ mkU32(31)
+ )
+ )
+ ),
+ mkU32(1)
+ )
+ )
+ );
+ }
+ assign(
+ *res,
+ IRExpr_Mux0X(
+ unop(Iop_32to8, mkexpr(amt5T)), mkexpr(rMt),
+ binop(Iop_Or32,
+ binop(Iop_Shr32,
+ mkexpr(rMt),
+ unop(Iop_32to8, mkexpr(amt5T))
+ ),
+ binop(Iop_Shl32,
+ mkexpr(rMt),
+ unop(Iop_32to8,
+ binop(Iop_Sub32, mkU32(32), mkexpr(amt5T))
+ )
+ )
+ )
+ )
+ );
+ DIS(buf, "r%u, ROR r#%u", rM, rS);
+}
+
+
+/* Generate an expression corresponding to the immediate-shift case of
+ a shifter operand. This is used both for ARM and Thumb2.
+
+ Bind it to a temporary, and return that via *res. If newC is
+ non-NULL, also compute a value for the shifter's carry out (in the
+   LSB of a word), bind it to a temporary, and return that via *newC.
+
+ Generates GETs from the guest state and is therefore not safe to
+ use once we start doing PUTs to it, for any given instruction.
+
+ 'how' is encoded thusly:
+ 00b LSL, 01b LSR, 10b ASR, 11b ROR
+ Most but not all ARM and Thumb integer insns use this encoding.
+ Be careful to ensure the right value is passed here.
+*/
+static void compute_result_and_C_after_shift_by_imm5 (
+ /*OUT*/HChar* buf,
+ /*OUT*/IRTemp* res,
+ /*OUT*/IRTemp* newC,
+ IRTemp rMt, /* reg to shift */
+ UInt how, /* what kind of shift */
+ UInt shift_amt, /* shift amount (0..31) */
+ UInt rM /* only for debug printing */
+ )
+{
+ vassert(shift_amt < 32);
+ vassert(how < 4);
+
+ switch (how) {
+
+ case 0:
+ compute_result_and_C_after_LSL_by_imm5(
+ buf, res, newC, rMt, shift_amt, rM
+ );
+ break;
+
+ case 1:
+ compute_result_and_C_after_LSR_by_imm5(
+ buf, res, newC, rMt, shift_amt, rM
+ );
+ break;
+
+ case 2:
+ compute_result_and_C_after_ASR_by_imm5(
+ buf, res, newC, rMt, shift_amt, rM
+ );
+ break;
+
+ case 3:
+ if (shift_amt == 0) {
+ IRTemp oldcT = newTemp(Ity_I32);
+         // rotate right 1 bit through carry
+ // RRX -- described at ARM ARM A5-17
+ // res = (oldC << 31) | (Rm >>u 1)
+ // newC = Rm[0]
+ if (newC) {
+ assign( *newC,
+ binop(Iop_And32, mkexpr(rMt), mkU32(1)));
+ }
+ assign( oldcT, mk_armg_calculate_flag_c() );
+ assign( *res,
+ binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(oldcT), mkU8(31)),
+ binop(Iop_Shr32, mkexpr(rMt), mkU8(1))) );
+ DIS(buf, "r%u, RRX", rM);
+ } else {
+ // rotate right in range 1..31
+ // res = Rm `ror` shift_amt
+ // newC = Rm[shift_amt - 1]
+ vassert(shift_amt >= 1 && shift_amt <= 31);
+ if (newC) {
+ assign( *newC,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(rMt),
+ mkU8(shift_amt - 1)),
+ mkU32(1)));
+ }
+ assign( *res,
+ binop(Iop_Or32,
+ binop(Iop_Shr32, mkexpr(rMt), mkU8(shift_amt)),
+ binop(Iop_Shl32, mkexpr(rMt),
+ mkU8(32-shift_amt))));
+ DIS(buf, "r%u, ROR #%u", rM, shift_amt);
+ }
+ break;
+
+ default:
+ /*NOTREACHED*/
+ vassert(0);
+ }
+}
+
+
+/* Generate an expression corresponding to the register-shift case of
+ a shifter operand. This is used both for ARM and Thumb2.
+
+ Bind it to a temporary, and return that via *res. If newC is
+ non-NULL, also compute a value for the shifter's carry out (in the
+   LSB of a word), bind it to a temporary, and return that via *newC.
+
+ Generates GETs from the guest state and is therefore not safe to
+ use once we start doing PUTs to it, for any given instruction.
+
+ 'how' is encoded thusly:
+ 00b LSL, 01b LSR, 10b ASR, 11b ROR
+ Most but not all ARM and Thumb integer insns use this encoding.
+ Be careful to ensure the right value is passed here.
+*/
+static void compute_result_and_C_after_shift_by_reg (
+ /*OUT*/HChar* buf,
+ /*OUT*/IRTemp* res,
+ /*OUT*/IRTemp* newC,
+ IRTemp rMt, /* reg to shift */
+ UInt how, /* what kind of shift */
+ IRTemp rSt, /* shift amount */
+ UInt rM, /* only for debug printing */
+ UInt rS /* only for debug printing */
+ )
+{
+ vassert(how < 4);
+ switch (how) {
+ case 0: { /* LSL */
+ compute_result_and_C_after_LSL_by_reg(
+ buf, res, newC, rMt, rSt, rM, rS
+ );
+ break;
+ }
+ case 1: { /* LSR */
+ compute_result_and_C_after_LSR_by_reg(
+ buf, res, newC, rMt, rSt, rM, rS
+ );
+ break;
+ }
+ case 2: { /* ASR */
+ compute_result_and_C_after_ASR_by_reg(
+ buf, res, newC, rMt, rSt, rM, rS
+ );
+ break;
+ }
+ case 3: { /* ROR */
+ compute_result_and_C_after_ROR_by_reg(
+ buf, res, newC, rMt, rSt, rM, rS
+ );
+ break;
+ }
+ default:
+ /*NOTREACHED*/
+ vassert(0);
+ }
+}
+
+
+/* Generate an expression corresponding to a shifter_operand, bind it
+ to a temporary, and return that via *shop. If shco is non-NULL,
+ also compute a value for the shifter's carry out (in the LSB of a
+ word), bind it to a temporary, and return that via *shco.
+
+ If for some reason we can't come up with a shifter operand (missing
+ case? not really a shifter operand?) return False.
+
+ Generates GETs from the guest state and is therefore not safe to
+ use once we start doing PUTs to it, for any given instruction.
+
+ For ARM insns only; not for Thumb.
+*/
+static Bool mk_shifter_operand ( UInt insn_25, UInt insn_11_0,
+ /*OUT*/IRTemp* shop,
+ /*OUT*/IRTemp* shco,
+ /*OUT*/HChar* buf )
+{
+ UInt insn_4 = (insn_11_0 >> 4) & 1;
+ UInt insn_7 = (insn_11_0 >> 7) & 1;
+ vassert(insn_25 <= 0x1);
+ vassert(insn_11_0 <= 0xFFF);
+
+ vassert(shop && *shop == IRTemp_INVALID);
+ *shop = newTemp(Ity_I32);
+
+ if (shco) {
+ vassert(*shco == IRTemp_INVALID);
+ *shco = newTemp(Ity_I32);
+ }
+
+ /* 32-bit immediate */
+
+ if (insn_25 == 1) {
+ /* immediate: (7:0) rotated right by 2 * (11:8) */
+ UInt imm = (insn_11_0 >> 0) & 0xFF;
+ UInt rot = 2 * ((insn_11_0 >> 8) & 0xF);
+ vassert(rot <= 30);
+ imm = ROR32(imm, rot);
+ if (shco) {
+ if (rot == 0) {
+ assign( *shco, mk_armg_calculate_flag_c() );
+ } else {
+ assign( *shco, mkU32( (imm >> 31) & 1 ) );
+ }
+ }
+ DIS(buf, "#0x%x", imm);
+ assign( *shop, mkU32(imm) );
+ return True;
+ }
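+
+   /* Illustrative decode of the case above (not executed): insn_11_0
+      == 0x2FF gives imm == 0xFF and rot == 4, so the operand is
+      ROR32(0xFF, 4) == 0xF000000F and, since rot != 0, the carry-out
+      is its bit 31, which is 1. */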
+
+ /* Shift/rotate by immediate */
+
+ if (insn_25 == 0 && insn_4 == 0) {
+ /* Rm (3:0) shifted (6:5) by immediate (11:7) */
+ UInt shift_amt = (insn_11_0 >> 7) & 0x1F;
+ UInt rM = (insn_11_0 >> 0) & 0xF;
+ UInt how = (insn_11_0 >> 5) & 3;
+ /* how: 00 = Shl, 01 = Shr, 10 = Sar, 11 = Ror */
+ IRTemp rMt = newTemp(Ity_I32);
+ assign(rMt, getIRegA(rM));
+
+ vassert(shift_amt <= 31);
+
+ compute_result_and_C_after_shift_by_imm5(
+ buf, shop, shco, rMt, how, shift_amt, rM
+ );
+ return True;
+ }
+
+ /* Shift/rotate by register */
+ if (insn_25 == 0 && insn_4 == 1) {
+ /* Rm (3:0) shifted (6:5) by Rs (11:8) */
+ UInt rM = (insn_11_0 >> 0) & 0xF;
+ UInt rS = (insn_11_0 >> 8) & 0xF;
+ UInt how = (insn_11_0 >> 5) & 3;
+ /* how: 00 = Shl, 01 = Shr, 10 = Sar, 11 = Ror */
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp rSt = newTemp(Ity_I32);
+
+ if (insn_7 == 1)
+ return False; /* not really a shifter operand */
+
+ assign(rMt, getIRegA(rM));
+ assign(rSt, getIRegA(rS));
+
+ compute_result_and_C_after_shift_by_reg(
+ buf, shop, shco, rMt, how, rSt, rM, rS
+ );
+ return True;
+ }
+
+ vex_printf("mk_shifter_operand(0x%x,0x%x)\n", insn_25, insn_11_0 );
+ return False;
+}
+
+
+/* ARM only */
+static
+IRExpr* mk_EA_reg_plusminus_imm12 ( UInt rN, UInt bU, UInt imm12,
+ /*OUT*/HChar* buf )
+{
+ vassert(rN < 16);
+ vassert(bU < 2);
+ vassert(imm12 < 0x1000);
+ UChar opChar = bU == 1 ? '+' : '-';
+ DIS(buf, "[r%u, #%c%u]", rN, opChar, imm12);
+ return
+ binop( (bU == 1 ? Iop_Add32 : Iop_Sub32),
+ getIRegA(rN),
+ mkU32(imm12) );
+}
+
+
+/* ARM only.
+   NB: This is "DecodeImmShift" in newer versions of the ARM ARM.
+*/
+static
+IRExpr* mk_EA_reg_plusminus_shifted_reg ( UInt rN, UInt bU, UInt rM,
+ UInt sh2, UInt imm5,
+ /*OUT*/HChar* buf )
+{
+ vassert(rN < 16);
+ vassert(bU < 2);
+ vassert(rM < 16);
+ vassert(sh2 < 4);
+ vassert(imm5 < 32);
+ UChar opChar = bU == 1 ? '+' : '-';
+ IRExpr* index = NULL;
+ switch (sh2) {
+ case 0: /* LSL */
+ /* imm5 can be in the range 0 .. 31 inclusive. */
+ index = binop(Iop_Shl32, getIRegA(rM), mkU8(imm5));
+ DIS(buf, "[r%u, %c r%u LSL #%u]", rN, opChar, rM, imm5);
+ break;
+ case 1: /* LSR */
+ if (imm5 == 0) {
+ index = mkU32(0);
+            vassert(0); // ATC (awaiting test case)
+ } else {
+ index = binop(Iop_Shr32, getIRegA(rM), mkU8(imm5));
+ }
+ DIS(buf, "[r%u, %cr%u, LSR #%u]",
+ rN, opChar, rM, imm5 == 0 ? 32 : imm5);
+ break;
+ case 2: /* ASR */
+ /* Doesn't this just mean that the behaviour with imm5 == 0
+ is the same as if it had been 31 ? */
+ if (imm5 == 0) {
+ index = binop(Iop_Sar32, getIRegA(rM), mkU8(31));
+            vassert(0); // ATC (awaiting test case)
+ } else {
+ index = binop(Iop_Sar32, getIRegA(rM), mkU8(imm5));
+ }
+ DIS(buf, "[r%u, %cr%u, ASR #%u]",
+ rN, opChar, rM, imm5 == 0 ? 32 : imm5);
+ break;
+ case 3: /* ROR or RRX */
+ if (imm5 == 0) {
+ IRTemp rmT = newTemp(Ity_I32);
+ IRTemp cflagT = newTemp(Ity_I32);
+ assign(rmT, getIRegA(rM));
+ assign(cflagT, mk_armg_calculate_flag_c());
+ index = binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(cflagT), mkU8(31)),
+ binop(Iop_Shr32, mkexpr(rmT), mkU8(1)));
+ DIS(buf, "[r%u, %cr%u, RRX]", rN, opChar, rM);
+ } else {
+ IRTemp rmT = newTemp(Ity_I32);
+ assign(rmT, getIRegA(rM));
+ vassert(imm5 >= 1 && imm5 <= 31);
+ index = binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(rmT), mkU8(32-imm5)),
+ binop(Iop_Shr32, mkexpr(rmT), mkU8(imm5)));
+ DIS(buf, "[r%u, %cr%u, ROR #%u]", rN, opChar, rM, imm5);
+ }
+ break;
+ default:
+ vassert(0);
+ }
+ vassert(index);
+ return binop(bU == 1 ? Iop_Add32 : Iop_Sub32,
+ getIRegA(rN), index);
+}
+
+
+/* ARM only */
+static
+IRExpr* mk_EA_reg_plusminus_imm8 ( UInt rN, UInt bU, UInt imm8,
+ /*OUT*/HChar* buf )
+{
+ vassert(rN < 16);
+ vassert(bU < 2);
+ vassert(imm8 < 0x100);
+ UChar opChar = bU == 1 ? '+' : '-';
+ DIS(buf, "[r%u, #%c%u]", rN, opChar, imm8);
+ return
+ binop( (bU == 1 ? Iop_Add32 : Iop_Sub32),
+ getIRegA(rN),
+ mkU32(imm8) );
+}
+
+
+/* ARM only */
+static
+IRExpr* mk_EA_reg_plusminus_reg ( UInt rN, UInt bU, UInt rM,
+ /*OUT*/HChar* buf )
+{
+ vassert(rN < 16);
+ vassert(bU < 2);
+ vassert(rM < 16);
+ UChar opChar = bU == 1 ? '+' : '-';
+ IRExpr* index = getIRegA(rM);
+ DIS(buf, "[r%u, %c r%u]", rN, opChar, rM);
+ return binop(bU == 1 ? Iop_Add32 : Iop_Sub32,
+ getIRegA(rN), index);
+}
+
+
+/* irRes :: Ity_I32 holds a floating point comparison result encoded
+ as an IRCmpF64Result. Generate code to convert it to an
+ ARM-encoded (N,Z,C,V) group in the lowest 4 bits of an I32 value.
+ Assign a new temp to hold that value, and return the temp. */
+static
+IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes )
+{
+ IRTemp ix = newTemp(Ity_I32);
+ IRTemp termL = newTemp(Ity_I32);
+ IRTemp termR = newTemp(Ity_I32);
+ IRTemp nzcv = newTemp(Ity_I32);
+
+ /* This is where the fun starts. We have to convert 'irRes' from
+ an IR-convention return result (IRCmpF64Result) to an
+ ARM-encoded (N,Z,C,V) group. The final result is in the bottom
+ 4 bits of 'nzcv'. */
+ /* Map compare result from IR to ARM(nzcv) */
+ /*
+ FP cmp result | IR | ARM(nzcv)
+ --------------------------------
+ UN 0x45 0011
+ LT 0x01 1000
+ GT 0x00 0010
+ EQ 0x40 0110
+ */
+ /* Now since you're probably wondering WTF ..
+
+ ix fishes the useful bits out of the IR value, bits 6 and 0, and
+ places them side by side, giving a number which is 0, 1, 2 or 3.
+
+      termL is a sequence cooked up by GNU superopt.  It converts ix
+      into an almost correct NZCV value (incredibly), except
+      for the case of UN, where it produces 0100 instead of the
+      required 0011.
+
+      termR is therefore a correction term, also computed from ix.  It
+      is 1 in the UN case and 0 for LT, GT and EQ.  Hence, to get
+      the final correct value, we subtract termR from termL.
+
+ Don't take my word for it. There's a test program at the bottom
+ of this file, to try this out with.
+ */
+ assign(
+ ix,
+ binop(Iop_Or32,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(irRes), mkU8(5)),
+ mkU32(3)),
+ binop(Iop_And32, mkexpr(irRes), mkU32(1))));
+
+ assign(
+ termL,
+ binop(Iop_Add32,
+ binop(Iop_Shr32,
+ binop(Iop_Sub32,
+ binop(Iop_Shl32,
+ binop(Iop_Xor32, mkexpr(ix), mkU32(1)),
+ mkU8(30)),
+ mkU32(1)),
+ mkU8(29)),
+ mkU32(1)));
+
+ assign(
+ termR,
+ binop(Iop_And32,
+ binop(Iop_And32,
+ mkexpr(ix),
+ binop(Iop_Shr32, mkexpr(ix), mkU8(1))),
+ mkU32(1)));
+
+ assign(nzcv, binop(Iop_Sub32, mkexpr(termL), mkexpr(termR)));
+ return nzcv;
+}
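+
+/* Illustrative trace of the above (not executed): for irRes == 0x45
+   (unordered), ix == ((0x45 >> 5) & 3) | (0x45 & 1) == 3.  Then
+   termL == ((((3 ^ 1) << 30) - 1) >>u 29) + 1 == 4 and
+   termR == (3 & (3 >> 1)) & 1 == 1, so nzcv == 4 - 1 == 3 == 0011b,
+   exactly the ARM encoding required for an unordered compare. */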
+
+
+/* Thumb32 only. This is "ThumbExpandImm" in the ARM ARM. If
+ updatesC is non-NULL, a boolean is written to it indicating whether
+ or not the C flag is updated, as per ARM ARM "ThumbExpandImm_C".
+*/
+static UInt thumbExpandImm ( Bool* updatesC,
+ UInt imm1, UInt imm3, UInt imm8 )
+{
+ vassert(imm1 < (1<<1));
+ vassert(imm3 < (1<<3));
+ vassert(imm8 < (1<<8));
+ UInt i_imm3_a = (imm1 << 4) | (imm3 << 1) | ((imm8 >> 7) & 1);
+ UInt abcdefgh = imm8;
+ UInt lbcdefgh = imm8 | 0x80;
+ if (updatesC) {
+ *updatesC = i_imm3_a >= 8;
+ }
+ switch (i_imm3_a) {
+ case 0: case 1:
+ return abcdefgh;
+ case 2: case 3:
+ return (abcdefgh << 16) | abcdefgh;
+ case 4: case 5:
+ return (abcdefgh << 24) | (abcdefgh << 8);
+ case 6: case 7:
+ return (abcdefgh << 24) | (abcdefgh << 16)
+ | (abcdefgh << 8) | abcdefgh;
+ case 8 ... 31:
+ return lbcdefgh << (32 - i_imm3_a);
+ default:
+ break;
+ }
+ /*NOTREACHED*/vassert(0);
+}
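+
+/* Illustrative expansions (not executed): imm1:imm3:imm8 == 0:001:0xAB
+   gives i_imm3_a == 3, hence the result 0x00AB00AB, with no C update.
+   For imm1:imm3:imm8 == 1:000:0x60, i_imm3_a == 16, so the result is
+   (0x60 | 0x80) << (32 - 16) == 0x00E00000, and C is updated. */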
+
+
+/* Version of thumbExpandImm where we simply feed it the
+ instruction halfwords (the lowest addressed one is I0). */
+static UInt thumbExpandImm_from_I0_I1 ( Bool* updatesC,
+ UShort i0s, UShort i1s )
+{
+ UInt i0 = (UInt)i0s;
+ UInt i1 = (UInt)i1s;
+ UInt imm1 = SLICE_UInt(i0,10,10);
+ UInt imm3 = SLICE_UInt(i1,14,12);
+ UInt imm8 = SLICE_UInt(i1,7,0);
+ return thumbExpandImm(updatesC, imm1, imm3, imm8);
+}
+
+
+/* Thumb16 only. Given the firstcond and mask fields from an IT
+ instruction, compute the 32-bit ITSTATE value implied, as described
+ in libvex_guest_arm.h. This is not the ARM ARM representation.
+ Also produce the t/e chars for the 2nd, 3rd, 4th insns, for
+ disassembly printing. Returns False if firstcond or mask
+ denote something invalid.
+
+ The number and conditions for the instructions to be
+ conditionalised depend on firstcond and mask:
+
+ mask cond 1 cond 2 cond 3 cond 4
+
+ 1000 fc[3:0]
+ x100 fc[3:0] fc[3:1]:x
+ xy10 fc[3:0] fc[3:1]:x fc[3:1]:y
+ xyz1 fc[3:0] fc[3:1]:x fc[3:1]:y fc[3:1]:z
+
+ The condition fields are assembled in *itstate backwards (cond 4 at
+ the top, cond 1 at the bottom). Conditions are << 4'd and then
+ ^0xE'd, and those fields that correspond to instructions in the IT
+ block are tagged with a 1 bit.
+*/
+static Bool compute_ITSTATE ( /*OUT*/UInt* itstate,
+ /*OUT*/UChar* ch1,
+ /*OUT*/UChar* ch2,
+ /*OUT*/UChar* ch3,
+ UInt firstcond, UInt mask )
+{
+ vassert(firstcond <= 0xF);
+ vassert(mask <= 0xF);
+ *itstate = 0;
+ *ch1 = *ch2 = *ch3 = '.';
+ if (mask == 0)
+ return False; /* the logic below actually ensures this anyway,
+ but clearer to make it explicit. */
+ if (firstcond == 0xF)
+ return False; /* NV is not allowed */
+ if (firstcond == 0xE && popcount32(mask) != 1)
+ return False; /* if firstcond is AL then all the rest must be too */
+
+ UInt m3 = (mask >> 3) & 1;
+ UInt m2 = (mask >> 2) & 1;
+ UInt m1 = (mask >> 1) & 1;
+ UInt m0 = (mask >> 0) & 1;
+
+ UInt fc = (firstcond << 4) | 1/*in-IT-block*/;
+ UInt ni = (0xE/*AL*/ << 4) | 0/*not-in-IT-block*/;
+
+ if (m3 == 1 && (m2|m1|m0) == 0) {
+ *itstate = (ni << 24) | (ni << 16) | (ni << 8) | fc;
+ *itstate ^= 0xE0E0E0E0;
+ return True;
+ }
+
+ if (m2 == 1 && (m1|m0) == 0) {
+ *itstate = (ni << 24) | (ni << 16) | (setbit32(fc, 4, m3) << 8) | fc;
+ *itstate ^= 0xE0E0E0E0;
+ *ch1 = m3 == (firstcond & 1) ? 't' : 'e';
+ return True;
+ }
+
+ if (m1 == 1 && m0 == 0) {
+ *itstate = (ni << 24)
+ | (setbit32(fc, 4, m2) << 16)
+ | (setbit32(fc, 4, m3) << 8) | fc;
+ *itstate ^= 0xE0E0E0E0;
+ *ch1 = m3 == (firstcond & 1) ? 't' : 'e';
+ *ch2 = m2 == (firstcond & 1) ? 't' : 'e';
+ return True;
+ }
+
+ if (m0 == 1) {
+ *itstate = (setbit32(fc, 4, m1) << 24)
+ | (setbit32(fc, 4, m2) << 16)
+ | (setbit32(fc, 4, m3) << 8) | fc;
+ *itstate ^= 0xE0E0E0E0;
+ *ch1 = m3 == (firstcond & 1) ? 't' : 'e';
+ *ch2 = m2 == (firstcond & 1) ? 't' : 'e';
+ *ch3 = m1 == (firstcond & 1) ? 't' : 'e';
+ return True;
+ }
+
+ return False;
+}
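+
+/* Illustrative encoding (not executed): a plain "IT EQ" has firstcond
+   == 0 and mask == 1000b, so fc == 0x01, ni == 0xE0, and *itstate ==
+   0xE0E0E001 ^ 0xE0E0E0E0 == 0x000000E1: one guarded slot at the
+   bottom, with the other three slots marked not-in-IT-block. */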
+
+
+/* Generate IR to do 32-bit bit reversal, a la Hacker's Delight
+ Chapter 7 Section 1. */
+static IRTemp gen_BITREV ( IRTemp x0 )
+{
+ IRTemp x1 = newTemp(Ity_I32);
+ IRTemp x2 = newTemp(Ity_I32);
+ IRTemp x3 = newTemp(Ity_I32);
+ IRTemp x4 = newTemp(Ity_I32);
+ IRTemp x5 = newTemp(Ity_I32);
+ UInt c1 = 0x55555555;
+ UInt c2 = 0x33333333;
+ UInt c3 = 0x0F0F0F0F;
+ UInt c4 = 0x00FF00FF;
+ UInt c5 = 0x0000FFFF;
+ assign(x1,
+ binop(Iop_Or32,
+ binop(Iop_Shl32,
+ binop(Iop_And32, mkexpr(x0), mkU32(c1)),
+ mkU8(1)),
+ binop(Iop_Shr32,
+ binop(Iop_And32, mkexpr(x0), mkU32(~c1)),
+ mkU8(1))
+ ));
+ assign(x2,
+ binop(Iop_Or32,
+ binop(Iop_Shl32,
+ binop(Iop_And32, mkexpr(x1), mkU32(c2)),
+ mkU8(2)),
+ binop(Iop_Shr32,
+ binop(Iop_And32, mkexpr(x1), mkU32(~c2)),
+ mkU8(2))
+ ));
+ assign(x3,
+ binop(Iop_Or32,
+ binop(Iop_Shl32,
+ binop(Iop_And32, mkexpr(x2), mkU32(c3)),
+ mkU8(4)),
+ binop(Iop_Shr32,
+ binop(Iop_And32, mkexpr(x2), mkU32(~c3)),
+ mkU8(4))
+ ));
+ assign(x4,
+ binop(Iop_Or32,
+ binop(Iop_Shl32,
+ binop(Iop_And32, mkexpr(x3), mkU32(c4)),
+ mkU8(8)),
+ binop(Iop_Shr32,
+ binop(Iop_And32, mkexpr(x3), mkU32(~c4)),
+ mkU8(8))
+ ));
+ assign(x5,
+ binop(Iop_Or32,
+ binop(Iop_Shl32,
+ binop(Iop_And32, mkexpr(x4), mkU32(c5)),
+ mkU8(16)),
+ binop(Iop_Shr32,
+ binop(Iop_And32, mkexpr(x4), mkU32(~c5)),
+ mkU8(16))
+ ));
+ return x5;
+}
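+
+/* Sanity check for the above (illustrative only): the five stages swap
+   adjacent bits, then 2-bit pairs, nibbles, bytes, and finally
+   halfwords, so 0x00000001 reverses to 0x80000000 and 0x80000002 to
+   0x40000001. */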
+
+
+/* Generate IR to rearrange bytes 3:2:1:0 in a word into the order
+   0:1:2:3 (aka byte-swap). */
+static IRTemp gen_REV ( IRTemp arg )
+{
+ IRTemp res = newTemp(Ity_I32);
+ assign(res,
+ binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(arg), mkU8(24)),
+ binop(Iop_Or32,
+ binop(Iop_And32, binop(Iop_Shl32, mkexpr(arg), mkU8(8)),
+ mkU32(0x00FF0000)),
+ binop(Iop_Or32,
+ binop(Iop_And32, binop(Iop_Shr32, mkexpr(arg), mkU8(8)),
+ mkU32(0x0000FF00)),
+ binop(Iop_And32, binop(Iop_Shr32, mkexpr(arg), mkU8(24)),
+ mkU32(0x000000FF) )
+ ))));
+ return res;
+}
+
+
+/* Generate IR to rearrange bytes 3:2:1:0 in a word into the order
+   2:3:0:1 (swap within lo and hi halves). */
+static IRTemp gen_REV16 ( IRTemp arg )
+{
+ IRTemp res = newTemp(Ity_I32);
+ assign(res,
+ binop(Iop_Or32,
+ binop(Iop_And32,
+ binop(Iop_Shl32, mkexpr(arg), mkU8(8)),
+ mkU32(0xFF00FF00)),
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(arg), mkU8(8)),
+ mkU32(0x00FF00FF))));
+ return res;
+}
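+
+/* Illustrative results (not executed): for the input 0x11223344,
+   gen_REV yields 0x44332211 and gen_REV16 yields 0x22114433. */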
+
+
+/*------------------------------------------------------------*/
+/*--- Advanced SIMD (NEON) instructions ---*/
+/*------------------------------------------------------------*/
+
+/*------------------------------------------------------------*/
+/*--- NEON data processing ---*/
+/*------------------------------------------------------------*/
+
+/* For all NEON DP ops, we use the normal scheme to handle conditional
+ writes to registers -- pass in condT and hand that on to the
+ put*Reg functions. In ARM mode condT is always IRTemp_INVALID
+ since NEON is unconditional for ARM. In Thumb mode condT is
+ derived from the ITSTATE shift register in the normal way. */
+
+static
+UInt get_neon_d_regno(UInt theInstr)
+{
+ UInt x = ((theInstr >> 18) & 0x10) | ((theInstr >> 12) & 0xF);
+ if (theInstr & 0x40) {
+ if (x & 1) {
+ x = x + 0x100;
+ } else {
+ x = x >> 1;
+ }
+ }
+ return x;
+}
+
+static
+UInt get_neon_n_regno(UInt theInstr)
+{
+ UInt x = ((theInstr >> 3) & 0x10) | ((theInstr >> 16) & 0xF);
+ if (theInstr & 0x40) {
+ if (x & 1) {
+ x = x + 0x100;
+ } else {
+ x = x >> 1;
+ }
+ }
+ return x;
+}
+
+static
+UInt get_neon_m_regno(UInt theInstr)
+{
+ UInt x = ((theInstr >> 1) & 0x10) | (theInstr & 0xF);
+ if (theInstr & 0x40) {
+ if (x & 1) {
+ x = x + 0x100;
+ } else {
+ x = x >> 1;
+ }
+ }
+ return x;
+}
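+
+/* Note on the three decoders above: bit 6 of the instruction selects a
+   quad (Q) operation, in which case the encoded register number must
+   be even; even values are halved to give the Q register number, while
+   odd (architecturally invalid) encodings are flagged by adding 0x100,
+   which callers can then reject (see the >= 0x100 checks in
+   dis_neon_vtb below). */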
+
+static
+Bool dis_neon_vext ( UInt theInstr, IRTemp condT )
+{
+ UInt dreg = get_neon_d_regno(theInstr);
+ UInt mreg = get_neon_m_regno(theInstr);
+ UInt nreg = get_neon_n_regno(theInstr);
+ UInt imm4 = (theInstr >> 8) & 0xf;
+ UInt Q = (theInstr >> 6) & 1;
+ HChar reg_t = Q ? 'q' : 'd';
+
+ if (Q) {
+ putQReg(dreg, triop(Iop_ExtractV128, getQReg(nreg),
+ getQReg(mreg), mkU8(imm4)), condT);
+ } else {
+ putDRegI64(dreg, triop(Iop_Extract64, getDRegI64(nreg),
+ getDRegI64(mreg), mkU8(imm4)), condT);
+ }
+ DIP("vext.8 %c%d, %c%d, %c%d, #%d\n", reg_t, dreg, reg_t, nreg,
+ reg_t, mreg, imm4);
+ return True;
+}
+
+/* VTBL, VTBX */
+static
+Bool dis_neon_vtb ( UInt theInstr, IRTemp condT )
+{
+ UInt op = (theInstr >> 6) & 1;
+ UInt dreg = get_neon_d_regno(theInstr & ~(1 << 6));
+ UInt nreg = get_neon_n_regno(theInstr & ~(1 << 6));
+ UInt mreg = get_neon_m_regno(theInstr & ~(1 << 6));
+ UInt len = (theInstr >> 8) & 3;
+ Int i;
+ IROp cmp;
+ ULong imm;
+ IRTemp arg_l;
+ IRTemp old_mask, new_mask, cur_mask;
+ IRTemp old_res, new_res;
+ IRTemp old_arg, new_arg;
+
+ if (dreg >= 0x100 || mreg >= 0x100 || nreg >= 0x100)
+ return False;
+ if (nreg + len > 31)
+ return False;
+
+ cmp = Iop_CmpGT8Ux8;
+
+ old_mask = newTemp(Ity_I64);
+ old_res = newTemp(Ity_I64);
+ old_arg = newTemp(Ity_I64);
+ assign(old_mask, mkU64(0));
+ assign(old_res, mkU64(0));
+ assign(old_arg, getDRegI64(mreg));
+ imm = 8;
+ imm = (imm << 8) | imm;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
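+   /* At this point imm == 0x0808080808080808: the constant 8 in each
+      byte lane, used both to select the indices that fall within the
+      current 8-entry table chunk and to step the indices down for the
+      next chunk. */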
+
+ for (i = 0; i <= len; i++) {
+ arg_l = newTemp(Ity_I64);
+ new_mask = newTemp(Ity_I64);
+ cur_mask = newTemp(Ity_I64);
+ new_res = newTemp(Ity_I64);
+ new_arg = newTemp(Ity_I64);
+ assign(arg_l, getDRegI64(nreg+i));
+ assign(new_arg, binop(Iop_Sub8x8, mkexpr(old_arg), mkU64(imm)));
+ assign(cur_mask, binop(cmp, mkU64(imm), mkexpr(old_arg)));
+ assign(new_mask, binop(Iop_Or64, mkexpr(old_mask), mkexpr(cur_mask)));
+ assign(new_res, binop(Iop_Or64,
+ mkexpr(old_res),
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,
+ mkexpr(arg_l),
+ binop(Iop_And64,
+ mkexpr(old_arg),
+ mkexpr(cur_mask))),
+ mkexpr(cur_mask))));
+
+ old_arg = new_arg;
+ old_mask = new_mask;
+ old_res = new_res;
+ }
+ if (op) {
+ new_res = newTemp(Ity_I64);
+ assign(new_res, binop(Iop_Or64,
+ binop(Iop_And64,
+ getDRegI64(dreg),
+ unop(Iop_Not64, mkexpr(old_mask))),
+ mkexpr(old_res)));
+ old_res = new_res;
+ }
+
+ putDRegI64(dreg, mkexpr(old_res), condT);
+ DIP("vtb%c.8 d%u, {", op ? 'x' : 'l', dreg);
+ if (len > 0) {
+ DIP("d%u-d%u", nreg, nreg + len);
+ } else {
+ DIP("d%u", nreg);
+ }
+ DIP("}, d%u\n", mreg);
+ return True;
+}
+
+/* VDUP (scalar) */
+static
+Bool dis_neon_vdup ( UInt theInstr, IRTemp condT )
+{
+ UInt Q = (theInstr >> 6) & 1;
+ UInt dreg = ((theInstr >> 18) & 0x10) | ((theInstr >> 12) & 0xF);
+ UInt mreg = ((theInstr >> 1) & 0x10) | (theInstr & 0xF);
+ UInt imm4 = (theInstr >> 16) & 0xF;
+ UInt index;
+ UInt size;
+ IRTemp arg_m;
+ IRTemp res;
+ IROp op, op2;
+
+ if ((imm4 == 0) || (imm4 == 8))
+ return False;
+ if ((Q == 1) && ((dreg & 1) == 1))
+ return False;
+ if (Q)
+ dreg >>= 1;
+ arg_m = newTemp(Ity_I64);
+ assign(arg_m, getDRegI64(mreg));
+ if (Q)
+ res = newTemp(Ity_V128);
+ else
+ res = newTemp(Ity_I64);
+ if ((imm4 & 1) == 1) {
+ op = Q ? Iop_Dup8x16 : Iop_Dup8x8;
+ op2 = Iop_GetElem8x8;
+ index = imm4 >> 1;
+ size = 8;
+ } else if ((imm4 & 3) == 2) {
+ op = Q ? Iop_Dup16x8 : Iop_Dup16x4;
+ op2 = Iop_GetElem16x4;
+ index = imm4 >> 2;
+ size = 16;
+ } else if ((imm4 & 7) == 4) {
+ op = Q ? Iop_Dup32x4 : Iop_Dup32x2;
+ op2 = Iop_GetElem32x2;
+ index = imm4 >> 3;
+ size = 32;
+ } else {
+ return False; // can this ever happen?
+ }
+ assign(res, unop(op, binop(op2, mkexpr(arg_m), mkU8(index))));
+ if (Q) {
+ putQReg(dreg, mkexpr(res), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res), condT);
+ }
+ DIP("vdup.%d %c%d, d%d[%d]\n", size, Q ? 'q' : 'd', dreg, mreg, index);
+ return True;
+}
+
+/* A7.4.1 Three registers of the same length */
+static
+Bool dis_neon_data_3same ( UInt theInstr, IRTemp condT )
+{
+ UInt Q = (theInstr >> 6) & 1;
+ UInt dreg = get_neon_d_regno(theInstr);
+ UInt nreg = get_neon_n_regno(theInstr);
+ UInt mreg = get_neon_m_regno(theInstr);
+ UInt A = (theInstr >> 8) & 0xF;
+ UInt B = (theInstr >> 4) & 1;
+ UInt C = (theInstr >> 20) & 0x3;
+ UInt U = (theInstr >> 24) & 1;
+ UInt size = C;
+
+ IRTemp arg_n;
+ IRTemp arg_m;
+ IRTemp res;
+
+ if (Q) {
+ arg_n = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ assign(arg_n, getQReg(nreg));
+ assign(arg_m, getQReg(mreg));
+ } else {
+ arg_n = newTemp(Ity_I64);
+ arg_m = newTemp(Ity_I64);
+ res = newTemp(Ity_I64);
+ assign(arg_n, getDRegI64(nreg));
+ assign(arg_m, getDRegI64(mreg));
+ }
+
+ switch(A) {
+ case 0:
+ if (B == 0) {
+ /* VHADD */
+ ULong imm = 0;
+ IRExpr *imm_val;
+ IROp addOp;
+ IROp andOp;
+ IROp shOp;
+ char regType = Q ? 'q' : 'd';
+
+ if (size == 3)
+ return False;
+ switch(size) {
+ case 0: imm = 0x101010101010101LL; break;
+ case 1: imm = 0x1000100010001LL; break;
+ case 2: imm = 0x100000001LL; break;
+ default: vassert(0);
+ }
+ if (Q) {
+ imm_val = binop(Iop_64HLtoV128, mkU64(imm), mkU64(imm));
+ andOp = Iop_AndV128;
+ } else {
+ imm_val = mkU64(imm);
+ andOp = Iop_And64;
+ }
+ if (U) {
+ switch(size) {
+ case 0:
+ addOp = Q ? Iop_Add8x16 : Iop_Add8x8;
+ shOp = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ break;
+ case 1:
+ addOp = Q ? Iop_Add16x8 : Iop_Add16x4;
+ shOp = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ break;
+ case 2:
+ addOp = Q ? Iop_Add32x4 : Iop_Add32x2;
+ shOp = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch(size) {
+ case 0:
+ addOp = Q ? Iop_Add8x16 : Iop_Add8x8;
+ shOp = Q ? Iop_SarN8x16 : Iop_SarN8x8;
+ break;
+ case 1:
+ addOp = Q ? Iop_Add16x8 : Iop_Add16x4;
+ shOp = Q ? Iop_SarN16x8 : Iop_SarN16x4;
+ break;
+ case 2:
+ addOp = Q ? Iop_Add32x4 : Iop_Add32x2;
+ shOp = Q ? Iop_SarN32x4 : Iop_SarN32x2;
+ break;
+ default:
+ vassert(0);
+ }
+ }
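+         /* Editorial: the assign below is the standard widening-free
+            halving add, (a + b) >> 1 computed per lane as
+            (a >> 1) + (b >> 1) + (((a & 1) + (b & 1)) >> 1); e.g. for
+            vhadd.u8, a = 0xFF, b = 0x01: 0x7F + 0x00 + 0x01 = 0x80,
+            with no 9-bit intermediate needed. */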
+ assign(res,
+ binop(addOp,
+ binop(addOp,
+ binop(shOp, mkexpr(arg_m), mkU8(1)),
+ binop(shOp, mkexpr(arg_n), mkU8(1))),
+ binop(shOp,
+ binop(addOp,
+ binop(andOp, mkexpr(arg_m), imm_val),
+ binop(andOp, mkexpr(arg_n), imm_val)),
+ mkU8(1))));
+ DIP("vhadd.%c%d %c%d, %c%d, %c%d\n",
+ U ? 'u' : 's', 8 << size, regType,
+ dreg, regType, nreg, regType, mreg);
+ } else {
+ /* VQADD */
+ IROp op, op2;
+ IRTemp tmp;
+         HChar reg_t = Q ? 'q' : 'd';
+ if (Q) {
+ switch (size) {
+ case 0:
+ op = U ? Iop_QAdd8Ux16 : Iop_QAdd8Sx16;
+ op2 = Iop_Add8x16;
+ break;
+ case 1:
+ op = U ? Iop_QAdd16Ux8 : Iop_QAdd16Sx8;
+ op2 = Iop_Add16x8;
+ break;
+ case 2:
+ op = U ? Iop_QAdd32Ux4 : Iop_QAdd32Sx4;
+ op2 = Iop_Add32x4;
+ break;
+ case 3:
+ op = U ? Iop_QAdd64Ux2 : Iop_QAdd64Sx2;
+ op2 = Iop_Add64x2;
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op = U ? Iop_QAdd8Ux8 : Iop_QAdd8Sx8;
+ op2 = Iop_Add8x8;
+ break;
+ case 1:
+ op = U ? Iop_QAdd16Ux4 : Iop_QAdd16Sx4;
+ op2 = Iop_Add16x4;
+ break;
+ case 2:
+ op = U ? Iop_QAdd32Ux2 : Iop_QAdd32Sx2;
+ op2 = Iop_Add32x2;
+ break;
+ case 3:
+ op = U ? Iop_QAdd64Ux1 : Iop_QAdd64Sx1;
+ op2 = Iop_Add64;
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ if (Q) {
+ tmp = newTemp(Ity_V128);
+ } else {
+ tmp = newTemp(Ity_I64);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+#ifndef DISABLE_QC_FLAG
+ assign(tmp, binop(op2, mkexpr(arg_n), mkexpr(arg_m)));
+ setFlag_QC(mkexpr(res), mkexpr(tmp), Q, condT);
+#endif
+ DIP("vqadd.%c%d %c%d, %c%d, %c%d\n",
+ U ? 'u' : 's',
+ 8 << size, reg_t, dreg, reg_t, nreg, reg_t, mreg);
+ }
+ break;
+ case 1:
+ if (B == 0) {
+ /* VRHADD */
+ /* VRHADD C, A, B ::=
+ C = (A >> 1) + (B >> 1) + (((A & 1) + (B & 1) + 1) >> 1) */
+ IROp shift_op, add_op;
+ IRTemp cc;
+ ULong one = 1;
+ HChar reg_t = Q ? 'q' : 'd';
+ switch (size) {
+ case 0: one = (one << 8) | one; /* fall through */
+ case 1: one = (one << 16) | one; /* fall through */
+ case 2: one = (one << 32) | one; break;
+ case 3: return False;
+ default: vassert(0);
+ }
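+         /* Editorial worked example of the formula above for .u8:
+            A = 5, B = 4: (5 >> 1) + (4 >> 1) + ((1 + 0 + 1) >> 1)
+            = 2 + 2 + 1 = 5, which matches (5 + 4 + 1) >> 1 without
+            needing a widened intermediate. */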
+ if (Q) {
+ switch (size) {
+ case 0:
+ shift_op = U ? Iop_ShrN8x16 : Iop_SarN8x16;
+ add_op = Iop_Add8x16;
+ break;
+ case 1:
+ shift_op = U ? Iop_ShrN16x8 : Iop_SarN16x8;
+ add_op = Iop_Add16x8;
+ break;
+ case 2:
+ shift_op = U ? Iop_ShrN32x4 : Iop_SarN32x4;
+ add_op = Iop_Add32x4;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ shift_op = U ? Iop_ShrN8x8 : Iop_SarN8x8;
+ add_op = Iop_Add8x8;
+ break;
+ case 1:
+ shift_op = U ? Iop_ShrN16x4 : Iop_SarN16x4;
+ add_op = Iop_Add16x4;
+ break;
+ case 2:
+ shift_op = U ? Iop_ShrN32x2 : Iop_SarN32x2;
+ add_op = Iop_Add32x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ }
+ if (Q) {
+ cc = newTemp(Ity_V128);
+ assign(cc, binop(shift_op,
+ binop(add_op,
+ binop(add_op,
+ binop(Iop_AndV128,
+ mkexpr(arg_n),
+ binop(Iop_64HLtoV128,
+ mkU64(one),
+ mkU64(one))),
+ binop(Iop_AndV128,
+ mkexpr(arg_m),
+ binop(Iop_64HLtoV128,
+ mkU64(one),
+ mkU64(one)))),
+ binop(Iop_64HLtoV128,
+ mkU64(one),
+ mkU64(one))),
+ mkU8(1)));
+ assign(res, binop(add_op,
+ binop(add_op,
+ binop(shift_op,
+ mkexpr(arg_n),
+ mkU8(1)),
+ binop(shift_op,
+ mkexpr(arg_m),
+ mkU8(1))),
+ mkexpr(cc)));
+ } else {
+ cc = newTemp(Ity_I64);
+ assign(cc, binop(shift_op,
+ binop(add_op,
+ binop(add_op,
+ binop(Iop_And64,
+ mkexpr(arg_n),
+ mkU64(one)),
+ binop(Iop_And64,
+ mkexpr(arg_m),
+ mkU64(one))),
+ mkU64(one)),
+ mkU8(1)));
+ assign(res, binop(add_op,
+ binop(add_op,
+ binop(shift_op,
+ mkexpr(arg_n),
+ mkU8(1)),
+ binop(shift_op,
+ mkexpr(arg_m),
+ mkU8(1))),
+ mkexpr(cc)));
+ }
+ DIP("vrhadd.%c%d %c%d, %c%d, %c%d\n",
+ U ? 'u' : 's',
+ 8 << size, reg_t, dreg, reg_t, nreg, reg_t, mreg);
+ } else {
+ if (U == 0) {
+ switch(C) {
+ case 0: {
+ /* VAND */
+ HChar reg_t = Q ? 'q' : 'd';
+ if (Q) {
+ assign(res, binop(Iop_AndV128, mkexpr(arg_n),
+ mkexpr(arg_m)));
+ } else {
+ assign(res, binop(Iop_And64, mkexpr(arg_n),
+ mkexpr(arg_m)));
+ }
+ DIP("vand %c%d, %c%d, %c%d\n",
+ reg_t, dreg, reg_t, nreg, reg_t, mreg);
+ break;
+ }
+ case 1: {
+ /* VBIC */
+ HChar reg_t = Q ? 'q' : 'd';
+ if (Q) {
+ assign(res, binop(Iop_AndV128,mkexpr(arg_n),
+ unop(Iop_NotV128, mkexpr(arg_m))));
+ } else {
+ assign(res, binop(Iop_And64, mkexpr(arg_n),
+ unop(Iop_Not64, mkexpr(arg_m))));
+ }
+ DIP("vbic %c%d, %c%d, %c%d\n",
+ reg_t, dreg, reg_t, nreg, reg_t, mreg);
+ break;
+ }
+ case 2:
+               if (nreg != mreg) {
+ /* VORR */
+ HChar reg_t = Q ? 'q' : 'd';
+ if (Q) {
+ assign(res, binop(Iop_OrV128, mkexpr(arg_n),
+ mkexpr(arg_m)));
+ } else {
+ assign(res, binop(Iop_Or64, mkexpr(arg_n),
+ mkexpr(arg_m)));
+ }
+ DIP("vorr %c%d, %c%d, %c%d\n",
+ reg_t, dreg, reg_t, nreg, reg_t, mreg);
+ } else {
+ /* VMOV */
+ HChar reg_t = Q ? 'q' : 'd';
+ assign(res, mkexpr(arg_m));
+ DIP("vmov %c%d, %c%d\n", reg_t, dreg, reg_t, mreg);
+ }
+ break;
+ case 3:{
+ /* VORN */
+ HChar reg_t = Q ? 'q' : 'd';
+ if (Q) {
+ assign(res, binop(Iop_OrV128,mkexpr(arg_n),
+ unop(Iop_NotV128, mkexpr(arg_m))));
+ } else {
+ assign(res, binop(Iop_Or64, mkexpr(arg_n),
+ unop(Iop_Not64, mkexpr(arg_m))));
+ }
+ DIP("vorn %c%d, %c%d, %c%d\n",
+ reg_t, dreg, reg_t, nreg, reg_t, mreg);
+ break;
+ }
+ }
+ } else {
+ switch(C) {
+ case 0:
+ /* VEOR (XOR) */
+ if (Q) {
+ assign(res, binop(Iop_XorV128, mkexpr(arg_n),
+ mkexpr(arg_m)));
+ } else {
+ assign(res, binop(Iop_Xor64, mkexpr(arg_n),
+ mkexpr(arg_m)));
+ }
+ DIP("veor %c%u, %c%u, %c%u\n", Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ break;
+ case 1:
+ /* VBSL */
+ if (Q) {
+ IRTemp reg_d = newTemp(Ity_V128);
+ assign(reg_d, getQReg(dreg));
+ assign(res,
+ binop(Iop_OrV128,
+ binop(Iop_AndV128, mkexpr(arg_n),
+ mkexpr(reg_d)),
+ binop(Iop_AndV128,
+ mkexpr(arg_m),
+ unop(Iop_NotV128,
+ mkexpr(reg_d)) ) ) );
+ } else {
+ IRTemp reg_d = newTemp(Ity_I64);
+ assign(reg_d, getDRegI64(dreg));
+ assign(res,
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(arg_n),
+ mkexpr(reg_d)),
+ binop(Iop_And64,
+ mkexpr(arg_m),
+ unop(Iop_Not64, mkexpr(reg_d)))));
+ }
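+            /* Editorial: VBSL is a bitwise select with the mask taken
+               from the destination: res = (n & d) | (m & ~d), so bits
+               of n are chosen where d is 1 and bits of m where d is 0.
+               VBIT and VBIF below are the same select with m as the
+               mask. */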
+ DIP("vbsl %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ break;
+ case 2:
+ /* VBIT */
+ if (Q) {
+ IRTemp reg_d = newTemp(Ity_V128);
+ assign(reg_d, getQReg(dreg));
+ assign(res,
+ binop(Iop_OrV128,
+ binop(Iop_AndV128, mkexpr(arg_n),
+ mkexpr(arg_m)),
+ binop(Iop_AndV128,
+ mkexpr(reg_d),
+ unop(Iop_NotV128, mkexpr(arg_m)))));
+ } else {
+ IRTemp reg_d = newTemp(Ity_I64);
+ assign(reg_d, getDRegI64(dreg));
+ assign(res,
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(arg_n),
+ mkexpr(arg_m)),
+ binop(Iop_And64,
+ mkexpr(reg_d),
+ unop(Iop_Not64, mkexpr(arg_m)))));
+ }
+ DIP("vbit %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ break;
+ case 3:
+ /* VBIF */
+ if (Q) {
+ IRTemp reg_d = newTemp(Ity_V128);
+ assign(reg_d, getQReg(dreg));
+ assign(res,
+ binop(Iop_OrV128,
+ binop(Iop_AndV128, mkexpr(reg_d),
+ mkexpr(arg_m)),
+ binop(Iop_AndV128,
+ mkexpr(arg_n),
+ unop(Iop_NotV128, mkexpr(arg_m)))));
+ } else {
+ IRTemp reg_d = newTemp(Ity_I64);
+ assign(reg_d, getDRegI64(dreg));
+ assign(res,
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(reg_d),
+ mkexpr(arg_m)),
+ binop(Iop_And64,
+ mkexpr(arg_n),
+ unop(Iop_Not64, mkexpr(arg_m)))));
+ }
+ DIP("vbif %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ }
+ }
+ break;
+ case 2:
+ if (B == 0) {
+ /* VHSUB */
+ /* (A >> 1) - (B >> 1) - (NOT (A) & B & 1) */
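+         /* Editorial sanity check of the formula, per lane for .s8:
+            A = 0, B = 1: (0 >> 1) - (1 >> 1) - (~0 & 1 & 1) = -1,
+            matching the flooring halved subtract (0 - 1) >> 1. */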
+ ULong imm = 0;
+ IRExpr *imm_val;
+ IROp subOp;
+ IROp notOp;
+ IROp andOp;
+ IROp shOp;
+ if (size == 3)
+ return False;
+ switch(size) {
+ case 0: imm = 0x101010101010101LL; break;
+ case 1: imm = 0x1000100010001LL; break;
+ case 2: imm = 0x100000001LL; break;
+ default: vassert(0);
+ }
+ if (Q) {
+ imm_val = binop(Iop_64HLtoV128, mkU64(imm), mkU64(imm));
+ andOp = Iop_AndV128;
+ notOp = Iop_NotV128;
+ } else {
+ imm_val = mkU64(imm);
+ andOp = Iop_And64;
+ notOp = Iop_Not64;
+ }
+ if (U) {
+ switch(size) {
+ case 0:
+ subOp = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ shOp = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ break;
+ case 1:
+ subOp = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ shOp = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ break;
+ case 2:
+ subOp = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ shOp = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch(size) {
+ case 0:
+ subOp = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ shOp = Q ? Iop_SarN8x16 : Iop_SarN8x8;
+ break;
+ case 1:
+ subOp = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ shOp = Q ? Iop_SarN16x8 : Iop_SarN16x4;
+ break;
+ case 2:
+ subOp = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ shOp = Q ? Iop_SarN32x4 : Iop_SarN32x2;
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ assign(res,
+ binop(subOp,
+ binop(subOp,
+ binop(shOp, mkexpr(arg_n), mkU8(1)),
+ binop(shOp, mkexpr(arg_m), mkU8(1))),
+ binop(andOp,
+ binop(andOp,
+ unop(notOp, mkexpr(arg_n)),
+ mkexpr(arg_m)),
+ imm_val)));
+ DIP("vhsub.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ } else {
+ /* VQSUB */
+ IROp op, op2;
+ IRTemp tmp;
+ if (Q) {
+ switch (size) {
+ case 0:
+ op = U ? Iop_QSub8Ux16 : Iop_QSub8Sx16;
+ op2 = Iop_Sub8x16;
+ break;
+ case 1:
+ op = U ? Iop_QSub16Ux8 : Iop_QSub16Sx8;
+ op2 = Iop_Sub16x8;
+ break;
+ case 2:
+ op = U ? Iop_QSub32Ux4 : Iop_QSub32Sx4;
+ op2 = Iop_Sub32x4;
+ break;
+ case 3:
+ op = U ? Iop_QSub64Ux2 : Iop_QSub64Sx2;
+ op2 = Iop_Sub64x2;
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op = U ? Iop_QSub8Ux8 : Iop_QSub8Sx8;
+ op2 = Iop_Sub8x8;
+ break;
+ case 1:
+ op = U ? Iop_QSub16Ux4 : Iop_QSub16Sx4;
+ op2 = Iop_Sub16x4;
+ break;
+ case 2:
+ op = U ? Iop_QSub32Ux2 : Iop_QSub32Sx2;
+ op2 = Iop_Sub32x2;
+ break;
+ case 3:
+ op = U ? Iop_QSub64Ux1 : Iop_QSub64Sx1;
+ op2 = Iop_Sub64;
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ if (Q)
+ tmp = newTemp(Ity_V128);
+ else
+ tmp = newTemp(Ity_I64);
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+#ifndef DISABLE_QC_FLAG
+ assign(tmp, binop(op2, mkexpr(arg_n), mkexpr(arg_m)));
+ setFlag_QC(mkexpr(res), mkexpr(tmp), Q, condT);
+#endif
+ DIP("vqsub.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ }
+ break;
+ case 3: {
+ IROp op;
+ if (Q) {
+ switch (size) {
+ case 0: op = U ? Iop_CmpGT8Ux16 : Iop_CmpGT8Sx16; break;
+ case 1: op = U ? Iop_CmpGT16Ux8 : Iop_CmpGT16Sx8; break;
+ case 2: op = U ? Iop_CmpGT32Ux4 : Iop_CmpGT32Sx4; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0: op = U ? Iop_CmpGT8Ux8 : Iop_CmpGT8Sx8; break;
+ case 1: op = U ? Iop_CmpGT16Ux4 : Iop_CmpGT16Sx4; break;
+            case 2: op = U ? Iop_CmpGT32Ux2 : Iop_CmpGT32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ }
+ if (B == 0) {
+ /* VCGT */
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vcgt.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ } else {
+ /* VCGE */
+ /* VCGE res, argn, argm
+ is equal to
+ VCGT tmp, argm, argn
+ VNOT res, tmp */
+ assign(res,
+ unop(Q ? Iop_NotV128 : Iop_Not64,
+ binop(op, mkexpr(arg_m), mkexpr(arg_n))));
+ DIP("vcge.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ }
+ }
+ break;
+ case 4:
+ if (B == 0) {
+ /* VSHL */
+ IROp op, sub_op;
+ IRTemp tmp;
+ if (U) {
+ switch (size) {
+ case 0: op = Q ? Iop_Shl8x16 : Iop_Shl8x8; break;
+ case 1: op = Q ? Iop_Shl16x8 : Iop_Shl16x4; break;
+ case 2: op = Q ? Iop_Shl32x4 : Iop_Shl32x2; break;
+ case 3: op = Q ? Iop_Shl64x2 : Iop_Shl64; break;
+ default: vassert(0);
+ }
+ } else {
+ tmp = newTemp(Q ? Ity_V128 : Ity_I64);
+ switch (size) {
+ case 0:
+ op = Q ? Iop_Sar8x16 : Iop_Sar8x8;
+ sub_op = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ break;
+ case 1:
+ op = Q ? Iop_Sar16x8 : Iop_Sar16x4;
+ sub_op = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ break;
+ case 2:
+ op = Q ? Iop_Sar32x4 : Iop_Sar32x2;
+ sub_op = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ break;
+ case 3:
+ op = Q ? Iop_Sar64x2 : Iop_Sar64;
+ sub_op = Q ? Iop_Sub64x2 : Iop_Sub64;
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ if (U) {
+ if (!Q && (size == 3))
+ assign(res, binop(op, mkexpr(arg_m),
+ unop(Iop_64to8, mkexpr(arg_n))));
+ else
+ assign(res, binop(op, mkexpr(arg_m), mkexpr(arg_n)));
+ } else {
+ if (Q)
+ assign(tmp, binop(sub_op,
+ binop(Iop_64HLtoV128, mkU64(0), mkU64(0)),
+ mkexpr(arg_n)));
+ else
+ assign(tmp, binop(sub_op, mkU64(0), mkexpr(arg_n)));
+ if (!Q && (size == 3))
+ assign(res, binop(op, mkexpr(arg_m),
+ unop(Iop_64to8, mkexpr(tmp))));
+ else
+ assign(res, binop(op, mkexpr(arg_m), mkexpr(tmp)));
+ }
+ DIP("vshl.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, Q ? 'q' : 'd',
+ nreg);
+ } else {
+ /* VQSHL */
+ IROp op, op_rev, op_shrn, op_shln, cmp_neq, cmp_gt;
+ IRTemp tmp, shval, mask, old_shval;
+ UInt i;
+ ULong esize;
+ cmp_neq = Q ? Iop_CmpNEZ8x16 : Iop_CmpNEZ8x8;
+ cmp_gt = Q ? Iop_CmpGT8Sx16 : Iop_CmpGT8Sx8;
+ if (U) {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_QShl8x16 : Iop_QShl8x8;
+ op_rev = Q ? Iop_Shr8x16 : Iop_Shr8x8;
+ op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ op_shln = Q ? Iop_ShlN8x16 : Iop_ShlN8x8;
+ break;
+ case 1:
+ op = Q ? Iop_QShl16x8 : Iop_QShl16x4;
+ op_rev = Q ? Iop_Shr16x8 : Iop_Shr16x4;
+ op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ op_shln = Q ? Iop_ShlN16x8 : Iop_ShlN16x4;
+ break;
+ case 2:
+ op = Q ? Iop_QShl32x4 : Iop_QShl32x2;
+ op_rev = Q ? Iop_Shr32x4 : Iop_Shr32x2;
+ op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ op_shln = Q ? Iop_ShlN32x4 : Iop_ShlN32x2;
+ break;
+ case 3:
+ op = Q ? Iop_QShl64x2 : Iop_QShl64x1;
+ op_rev = Q ? Iop_Shr64x2 : Iop_Shr64;
+ op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
+ op_shln = Q ? Iop_ShlN64x2 : Iop_Shl64;
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_QSal8x16 : Iop_QSal8x8;
+ op_rev = Q ? Iop_Sar8x16 : Iop_Sar8x8;
+ op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ op_shln = Q ? Iop_ShlN8x16 : Iop_ShlN8x8;
+ break;
+ case 1:
+ op = Q ? Iop_QSal16x8 : Iop_QSal16x4;
+ op_rev = Q ? Iop_Sar16x8 : Iop_Sar16x4;
+ op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ op_shln = Q ? Iop_ShlN16x8 : Iop_ShlN16x4;
+ break;
+ case 2:
+ op = Q ? Iop_QSal32x4 : Iop_QSal32x2;
+ op_rev = Q ? Iop_Sar32x4 : Iop_Sar32x2;
+ op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ op_shln = Q ? Iop_ShlN32x4 : Iop_ShlN32x2;
+ break;
+ case 3:
+ op = Q ? Iop_QSal64x2 : Iop_QSal64x1;
+ op_rev = Q ? Iop_Sar64x2 : Iop_Sar64;
+ op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
+ op_shln = Q ? Iop_ShlN64x2 : Iop_Shl64;
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ if (Q) {
+ tmp = newTemp(Ity_V128);
+ shval = newTemp(Ity_V128);
+ mask = newTemp(Ity_V128);
+ } else {
+ tmp = newTemp(Ity_I64);
+ shval = newTemp(Ity_I64);
+ mask = newTemp(Ity_I64);
+ }
+ assign(res, binop(op, mkexpr(arg_m), mkexpr(arg_n)));
+#ifndef DISABLE_QC_FLAG
+         /* Only the least significant byte of the second operand is
+            used; replicate it across the whole vector element. */
+ assign(shval, binop(op_shrn,
+ binop(op_shln,
+ mkexpr(arg_n),
+ mkU8((8 << size) - 8)),
+ mkU8((8 << size) - 8)));
+ for(i = 0; i < size; i++) {
+ old_shval = shval;
+ shval = newTemp(Q ? Ity_V128 : Ity_I64);
+ assign(shval, binop(Q ? Iop_OrV128 : Iop_Or64,
+ mkexpr(old_shval),
+ binop(op_shln,
+ mkexpr(old_shval),
+ mkU8(8 << i))));
+ }
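+         /* Editorial illustration of the replication loop above for
+            size == 2 (32-bit lanes): a lane 0x12345603 becomes
+            0x00000003 after the shl/shr pair, 0x00000303 after i == 0
+            and 0x03030303 after i == 1, i.e. the shift byte is copied
+            into every byte of its lane. */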
+         /* If the shift amount is greater than or equal to the element
+            size and the element is non-zero, the QC flag must be set. */
+ esize = (8 << size) - 1;
+ esize = (esize << 8) | esize;
+ esize = (esize << 16) | esize;
+ esize = (esize << 32) | esize;
+ setFlag_QC(binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(cmp_gt, mkexpr(shval),
+ Q ? mkU128(esize) : mkU64(esize)),
+ unop(cmp_neq, mkexpr(arg_m))),
+ Q ? mkU128(0) : mkU64(0),
+ Q, condT);
+         /* Otherwise the QC flag must be set if the shift amount is
+            positive and shifting the result back right by the same
+            amount does not reproduce the shifted operand. */
+ assign(mask, binop(cmp_gt, mkexpr(shval),
+ Q ? mkU128(0) : mkU64(0)));
+ if (!Q && size == 3)
+ assign(tmp, binop(op_rev, mkexpr(res),
+ unop(Iop_64to8, mkexpr(arg_n))));
+ else
+ assign(tmp, binop(op_rev, mkexpr(res), mkexpr(arg_n)));
+ setFlag_QC(binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(tmp), mkexpr(mask)),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(arg_m), mkexpr(mask)),
+ Q, condT);
+#endif
+ DIP("vqshl.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, Q ? 'q' : 'd',
+ nreg);
+ }
+ break;
+ case 5:
+ if (B == 0) {
+ /* VRSHL */
+ IROp op, op_shrn, op_shln, cmp_gt, op_sub, op_add;
+ IRTemp shval, old_shval, imm_val, round;
+ UInt i;
+ ULong imm;
+ cmp_gt = Q ? Iop_CmpGT8Sx16 : Iop_CmpGT8Sx8;
+ imm = 1L;
+ switch (size) {
+ case 0: imm = (imm << 8) | imm; /* fall through */
+ case 1: imm = (imm << 16) | imm; /* fall through */
+ case 2: imm = (imm << 32) | imm; /* fall through */
+ case 3: break;
+ default: vassert(0);
+ }
+ imm_val = newTemp(Q ? Ity_V128 : Ity_I64);
+ round = newTemp(Q ? Ity_V128 : Ity_I64);
+ assign(imm_val, Q ? mkU128(imm) : mkU64(imm));
+ if (U) {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_Shl8x16 : Iop_Shl8x8;
+ op_sub = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ op_add = Q ? Iop_Add8x16 : Iop_Add8x8;
+ op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ op_shln = Q ? Iop_ShlN8x16 : Iop_ShlN8x8;
+ break;
+ case 1:
+ op = Q ? Iop_Shl16x8 : Iop_Shl16x4;
+ op_sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ op_add = Q ? Iop_Add16x8 : Iop_Add16x4;
+ op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ op_shln = Q ? Iop_ShlN16x8 : Iop_ShlN16x4;
+ break;
+ case 2:
+ op = Q ? Iop_Shl32x4 : Iop_Shl32x2;
+ op_sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ op_add = Q ? Iop_Add32x4 : Iop_Add32x2;
+ op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ op_shln = Q ? Iop_ShlN32x4 : Iop_ShlN32x2;
+ break;
+ case 3:
+ op = Q ? Iop_Shl64x2 : Iop_Shl64;
+ op_sub = Q ? Iop_Sub64x2 : Iop_Sub64;
+ op_add = Q ? Iop_Add64x2 : Iop_Add64;
+ op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
+ op_shln = Q ? Iop_ShlN64x2 : Iop_Shl64;
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_Sal8x16 : Iop_Sal8x8;
+ op_sub = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ op_add = Q ? Iop_Add8x16 : Iop_Add8x8;
+ op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ op_shln = Q ? Iop_ShlN8x16 : Iop_ShlN8x8;
+ break;
+ case 1:
+ op = Q ? Iop_Sal16x8 : Iop_Sal16x4;
+ op_sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ op_add = Q ? Iop_Add16x8 : Iop_Add16x4;
+ op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ op_shln = Q ? Iop_ShlN16x8 : Iop_ShlN16x4;
+ break;
+ case 2:
+ op = Q ? Iop_Sal32x4 : Iop_Sal32x2;
+ op_sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ op_add = Q ? Iop_Add32x4 : Iop_Add32x2;
+ op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ op_shln = Q ? Iop_ShlN32x4 : Iop_ShlN32x2;
+ break;
+ case 3:
+ op = Q ? Iop_Sal64x2 : Iop_Sal64x1;
+ op_sub = Q ? Iop_Sub64x2 : Iop_Sub64;
+ op_add = Q ? Iop_Add64x2 : Iop_Add64;
+ op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
+ op_shln = Q ? Iop_ShlN64x2 : Iop_Shl64;
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ if (Q) {
+ shval = newTemp(Ity_V128);
+ } else {
+ shval = newTemp(Ity_I64);
+ }
+         /* Only the least significant byte of the second operand is
+            used; replicate it across the whole vector element. */
+ assign(shval, binop(op_shrn,
+ binop(op_shln,
+ mkexpr(arg_n),
+ mkU8((8 << size) - 8)),
+ mkU8((8 << size) - 8)));
+ for (i = 0; i < size; i++) {
+ old_shval = shval;
+ shval = newTemp(Q ? Ity_V128 : Ity_I64);
+ assign(shval, binop(Q ? Iop_OrV128 : Iop_Or64,
+ mkexpr(old_shval),
+ binop(op_shln,
+ mkexpr(old_shval),
+ mkU8(8 << i))));
+ }
+ /* Compute the result */
+ if (!Q && size == 3 && U) {
+ assign(round, binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op,
+ mkexpr(arg_m),
+ unop(Iop_64to8,
+ binop(op_add,
+ mkexpr(arg_n),
+ mkexpr(imm_val)))),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(imm_val),
+ binop(cmp_gt,
+ Q ? mkU128(0) : mkU64(0),
+ mkexpr(arg_n)))));
+ assign(res, binop(op_add,
+ binop(op,
+ mkexpr(arg_m),
+ unop(Iop_64to8, mkexpr(arg_n))),
+ mkexpr(round)));
+ } else {
+ assign(round, binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op,
+ mkexpr(arg_m),
+ binop(op_add,
+ mkexpr(arg_n),
+ mkexpr(imm_val))),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(imm_val),
+ binop(cmp_gt,
+ Q ? mkU128(0) : mkU64(0),
+ mkexpr(arg_n)))));
+ assign(res, binop(op_add,
+ binop(op, mkexpr(arg_m), mkexpr(arg_n)),
+ mkexpr(round)));
+ }
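+         /* Editorial rounding illustration: for a negative (right)
+            shift the "round" term recovers the last bit shifted out,
+            e.g. vrshl.s8 with arg_n = -2, arg_m = 6: op gives
+            6 >> 2 = 1 and round = bit 1 of 6 = 1, so res = 2, which
+            equals (6 + 2) >> 2. */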
+ DIP("vrshl.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, Q ? 'q' : 'd',
+ nreg);
+ } else {
+ /* VQRSHL */
+ IROp op, op_rev, op_shrn, op_shln, cmp_neq, cmp_gt, op_sub, op_add;
+ IRTemp tmp, shval, mask, old_shval, imm_val, round;
+ UInt i;
+ ULong esize, imm;
+ cmp_neq = Q ? Iop_CmpNEZ8x16 : Iop_CmpNEZ8x8;
+ cmp_gt = Q ? Iop_CmpGT8Sx16 : Iop_CmpGT8Sx8;
+ imm = 1L;
+ switch (size) {
+ case 0: imm = (imm << 8) | imm; /* fall through */
+ case 1: imm = (imm << 16) | imm; /* fall through */
+ case 2: imm = (imm << 32) | imm; /* fall through */
+ case 3: break;
+ default: vassert(0);
+ }
+ imm_val = newTemp(Q ? Ity_V128 : Ity_I64);
+ round = newTemp(Q ? Ity_V128 : Ity_I64);
+ assign(imm_val, Q ? mkU128(imm) : mkU64(imm));
+ if (U) {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_QShl8x16 : Iop_QShl8x8;
+ op_sub = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ op_add = Q ? Iop_Add8x16 : Iop_Add8x8;
+ op_rev = Q ? Iop_Shr8x16 : Iop_Shr8x8;
+ op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ op_shln = Q ? Iop_ShlN8x16 : Iop_ShlN8x8;
+ break;
+ case 1:
+ op = Q ? Iop_QShl16x8 : Iop_QShl16x4;
+ op_sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ op_add = Q ? Iop_Add16x8 : Iop_Add16x4;
+ op_rev = Q ? Iop_Shr16x8 : Iop_Shr16x4;
+ op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ op_shln = Q ? Iop_ShlN16x8 : Iop_ShlN16x4;
+ break;
+ case 2:
+ op = Q ? Iop_QShl32x4 : Iop_QShl32x2;
+ op_sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ op_add = Q ? Iop_Add32x4 : Iop_Add32x2;
+ op_rev = Q ? Iop_Shr32x4 : Iop_Shr32x2;
+ op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ op_shln = Q ? Iop_ShlN32x4 : Iop_ShlN32x2;
+ break;
+ case 3:
+ op = Q ? Iop_QShl64x2 : Iop_QShl64x1;
+ op_sub = Q ? Iop_Sub64x2 : Iop_Sub64;
+ op_add = Q ? Iop_Add64x2 : Iop_Add64;
+ op_rev = Q ? Iop_Shr64x2 : Iop_Shr64;
+ op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
+ op_shln = Q ? Iop_ShlN64x2 : Iop_Shl64;
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_QSal8x16 : Iop_QSal8x8;
+ op_sub = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ op_add = Q ? Iop_Add8x16 : Iop_Add8x8;
+ op_rev = Q ? Iop_Sar8x16 : Iop_Sar8x8;
+ op_shrn = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ op_shln = Q ? Iop_ShlN8x16 : Iop_ShlN8x8;
+ break;
+ case 1:
+ op = Q ? Iop_QSal16x8 : Iop_QSal16x4;
+ op_sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ op_add = Q ? Iop_Add16x8 : Iop_Add16x4;
+ op_rev = Q ? Iop_Sar16x8 : Iop_Sar16x4;
+ op_shrn = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ op_shln = Q ? Iop_ShlN16x8 : Iop_ShlN16x4;
+ break;
+ case 2:
+ op = Q ? Iop_QSal32x4 : Iop_QSal32x2;
+ op_sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ op_add = Q ? Iop_Add32x4 : Iop_Add32x2;
+ op_rev = Q ? Iop_Sar32x4 : Iop_Sar32x2;
+ op_shrn = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ op_shln = Q ? Iop_ShlN32x4 : Iop_ShlN32x2;
+ break;
+ case 3:
+ op = Q ? Iop_QSal64x2 : Iop_QSal64x1;
+ op_sub = Q ? Iop_Sub64x2 : Iop_Sub64;
+ op_add = Q ? Iop_Add64x2 : Iop_Add64;
+ op_rev = Q ? Iop_Sar64x2 : Iop_Sar64;
+ op_shrn = Q ? Iop_ShrN64x2 : Iop_Shr64;
+ op_shln = Q ? Iop_ShlN64x2 : Iop_Shl64;
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ if (Q) {
+ tmp = newTemp(Ity_V128);
+ shval = newTemp(Ity_V128);
+ mask = newTemp(Ity_V128);
+ } else {
+ tmp = newTemp(Ity_I64);
+ shval = newTemp(Ity_I64);
+ mask = newTemp(Ity_I64);
+ }
+         /* Only the least significant byte of the second operand is
+            used; replicate it across the whole vector element. */
+ assign(shval, binop(op_shrn,
+ binop(op_shln,
+ mkexpr(arg_n),
+ mkU8((8 << size) - 8)),
+ mkU8((8 << size) - 8)));
+ for (i = 0; i < size; i++) {
+ old_shval = shval;
+ shval = newTemp(Q ? Ity_V128 : Ity_I64);
+ assign(shval, binop(Q ? Iop_OrV128 : Iop_Or64,
+ mkexpr(old_shval),
+ binop(op_shln,
+ mkexpr(old_shval),
+ mkU8(8 << i))));
+ }
+ /* Compute the result */
+ assign(round, binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op,
+ mkexpr(arg_m),
+ binop(op_add,
+ mkexpr(arg_n),
+ mkexpr(imm_val))),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(imm_val),
+ binop(cmp_gt,
+ Q ? mkU128(0) : mkU64(0),
+ mkexpr(arg_n)))));
+ assign(res, binop(op_add,
+ binop(op, mkexpr(arg_m), mkexpr(arg_n)),
+ mkexpr(round)));
+#ifndef DISABLE_QC_FLAG
+         /* If the shift amount is greater than or equal to the element
+            size and the element is non-zero, the QC flag must be set. */
+ esize = (8 << size) - 1;
+ esize = (esize << 8) | esize;
+ esize = (esize << 16) | esize;
+ esize = (esize << 32) | esize;
+ setFlag_QC(binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(cmp_gt, mkexpr(shval),
+ Q ? mkU128(esize) : mkU64(esize)),
+ unop(cmp_neq, mkexpr(arg_m))),
+ Q ? mkU128(0) : mkU64(0),
+ Q, condT);
+         /* Otherwise the QC flag must be set if the shift amount is
+            positive and shifting the result back right by the same
+            amount does not reproduce the shifted operand. */
+ assign(mask, binop(cmp_gt, mkexpr(shval),
+ Q ? mkU128(0) : mkU64(0)));
+ if (!Q && size == 3)
+ assign(tmp, binop(op_rev, mkexpr(res),
+ unop(Iop_64to8, mkexpr(arg_n))));
+ else
+ assign(tmp, binop(op_rev, mkexpr(res), mkexpr(arg_n)));
+ setFlag_QC(binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(tmp), mkexpr(mask)),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(arg_m), mkexpr(mask)),
+ Q, condT);
+#endif
+ DIP("vqrshl.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, Q ? 'q' : 'd',
+ nreg);
+ }
+ break;
+ case 6:
+ /* VMAX, VMIN */
+ if (B == 0) {
+ /* VMAX */
+ IROp op;
+ if (U == 0) {
+ switch (size) {
+ case 0: op = Q ? Iop_Max8Sx16 : Iop_Max8Sx8; break;
+ case 1: op = Q ? Iop_Max16Sx8 : Iop_Max16Sx4; break;
+ case 2: op = Q ? Iop_Max32Sx4 : Iop_Max32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0: op = Q ? Iop_Max8Ux16 : Iop_Max8Ux8; break;
+ case 1: op = Q ? Iop_Max16Ux8 : Iop_Max16Ux4; break;
+ case 2: op = Q ? Iop_Max32Ux4 : Iop_Max32Ux2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vmax.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ } else {
+ /* VMIN */
+ IROp op;
+ if (U == 0) {
+ switch (size) {
+ case 0: op = Q ? Iop_Min8Sx16 : Iop_Min8Sx8; break;
+ case 1: op = Q ? Iop_Min16Sx8 : Iop_Min16Sx4; break;
+ case 2: op = Q ? Iop_Min32Sx4 : Iop_Min32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0: op = Q ? Iop_Min8Ux16 : Iop_Min8Ux8; break;
+ case 1: op = Q ? Iop_Min16Ux8 : Iop_Min16Ux4; break;
+ case 2: op = Q ? Iop_Min32Ux4 : Iop_Min32Ux2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vmin.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ }
+ break;
+ case 7:
+ if (B == 0) {
+ /* VABD */
+ IROp op_cmp, op_sub;
+ IRTemp cond;
+ if ((theInstr >> 23) & 1) {
+ vpanic("VABDL should not be in dis_neon_data_3same\n");
+ }
+ if (Q) {
+ switch (size) {
+ case 0:
+ op_cmp = U ? Iop_CmpGT8Ux16 : Iop_CmpGT8Sx16;
+ op_sub = Iop_Sub8x16;
+ break;
+ case 1:
+ op_cmp = U ? Iop_CmpGT16Ux8 : Iop_CmpGT16Sx8;
+ op_sub = Iop_Sub16x8;
+ break;
+ case 2:
+ op_cmp = U ? Iop_CmpGT32Ux4 : Iop_CmpGT32Sx4;
+ op_sub = Iop_Sub32x4;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op_cmp = U ? Iop_CmpGT8Ux8 : Iop_CmpGT8Sx8;
+ op_sub = Iop_Sub8x8;
+ break;
+ case 1:
+ op_cmp = U ? Iop_CmpGT16Ux4 : Iop_CmpGT16Sx4;
+ op_sub = Iop_Sub16x4;
+ break;
+ case 2:
+ op_cmp = U ? Iop_CmpGT32Ux2 : Iop_CmpGT32Sx2;
+ op_sub = Iop_Sub32x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ }
+ if (Q) {
+ cond = newTemp(Ity_V128);
+ } else {
+ cond = newTemp(Ity_I64);
+ }
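+            /* Editorial: the assigns below compute a branchless
+               |n - m|: cond = (n > m) as an all-ones/all-zeros lane
+               mask, then res = ((n - m) & cond) | ((m - n) & ~cond),
+               keeping whichever difference is non-negative per lane. */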
+ assign(cond, binop(op_cmp, mkexpr(arg_n), mkexpr(arg_m)));
+ assign(res, binop(Q ? Iop_OrV128 : Iop_Or64,
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op_sub, mkexpr(arg_n),
+ mkexpr(arg_m)),
+ mkexpr(cond)),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op_sub, mkexpr(arg_m),
+ mkexpr(arg_n)),
+ unop(Q ? Iop_NotV128 : Iop_Not64,
+ mkexpr(cond)))));
+ DIP("vabd.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ } else {
+ /* VABA */
+ IROp op_cmp, op_sub, op_add;
+ IRTemp cond, acc, tmp;
+ if ((theInstr >> 23) & 1) {
+ vpanic("VABAL should not be in dis_neon_data_3same");
+ }
+ if (Q) {
+ switch (size) {
+ case 0:
+ op_cmp = U ? Iop_CmpGT8Ux16 : Iop_CmpGT8Sx16;
+ op_sub = Iop_Sub8x16;
+ op_add = Iop_Add8x16;
+ break;
+ case 1:
+ op_cmp = U ? Iop_CmpGT16Ux8 : Iop_CmpGT16Sx8;
+ op_sub = Iop_Sub16x8;
+ op_add = Iop_Add16x8;
+ break;
+ case 2:
+ op_cmp = U ? Iop_CmpGT32Ux4 : Iop_CmpGT32Sx4;
+ op_sub = Iop_Sub32x4;
+ op_add = Iop_Add32x4;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op_cmp = U ? Iop_CmpGT8Ux8 : Iop_CmpGT8Sx8;
+ op_sub = Iop_Sub8x8;
+ op_add = Iop_Add8x8;
+ break;
+ case 1:
+ op_cmp = U ? Iop_CmpGT16Ux4 : Iop_CmpGT16Sx4;
+ op_sub = Iop_Sub16x4;
+ op_add = Iop_Add16x4;
+ break;
+ case 2:
+ op_cmp = U ? Iop_CmpGT32Ux2 : Iop_CmpGT32Sx2;
+ op_sub = Iop_Sub32x2;
+ op_add = Iop_Add32x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ }
+ if (Q) {
+ cond = newTemp(Ity_V128);
+ acc = newTemp(Ity_V128);
+ tmp = newTemp(Ity_V128);
+ assign(acc, getQReg(dreg));
+ } else {
+ cond = newTemp(Ity_I64);
+ acc = newTemp(Ity_I64);
+ tmp = newTemp(Ity_I64);
+ assign(acc, getDRegI64(dreg));
+ }
+ assign(cond, binop(op_cmp, mkexpr(arg_n), mkexpr(arg_m)));
+ assign(tmp, binop(Q ? Iop_OrV128 : Iop_Or64,
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op_sub, mkexpr(arg_n),
+ mkexpr(arg_m)),
+ mkexpr(cond)),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op_sub, mkexpr(arg_m),
+ mkexpr(arg_n)),
+ unop(Q ? Iop_NotV128 : Iop_Not64,
+ mkexpr(cond)))));
+ assign(res, binop(op_add, mkexpr(acc), mkexpr(tmp)));
+ DIP("vaba.%c%u %c%u, %c%u, %c%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ }
+ break;
+ case 8:
+ if (B == 0) {
+ IROp op;
+ if (U == 0) {
+ /* VADD */
+ switch (size) {
+ case 0: op = Q ? Iop_Add8x16 : Iop_Add8x8; break;
+ case 1: op = Q ? Iop_Add16x8 : Iop_Add16x4; break;
+ case 2: op = Q ? Iop_Add32x4 : Iop_Add32x2; break;
+ case 3: op = Q ? Iop_Add64x2 : Iop_Add64; break;
+ default: vassert(0);
+ }
+ DIP("vadd.i%u %c%u, %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd',
+ dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ } else {
+ /* VSUB */
+ switch (size) {
+ case 0: op = Q ? Iop_Sub8x16 : Iop_Sub8x8; break;
+ case 1: op = Q ? Iop_Sub16x8 : Iop_Sub16x4; break;
+ case 2: op = Q ? Iop_Sub32x4 : Iop_Sub32x2; break;
+ case 3: op = Q ? Iop_Sub64x2 : Iop_Sub64; break;
+ default: vassert(0);
+ }
+ DIP("vsub.i%u %c%u, %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd',
+ dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ } else {
+ IROp op;
+ switch (size) {
+ case 0: op = Q ? Iop_CmpNEZ8x16 : Iop_CmpNEZ8x8; break;
+ case 1: op = Q ? Iop_CmpNEZ16x8 : Iop_CmpNEZ16x4; break;
+ case 2: op = Q ? Iop_CmpNEZ32x4 : Iop_CmpNEZ32x2; break;
+ case 3: op = Q ? Iop_CmpNEZ64x2 : Iop_CmpwNEZ64; break;
+ default: vassert(0);
+ }
+ if (U == 0) {
+ /* VTST */
+ assign(res, unop(op, binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(arg_n),
+ mkexpr(arg_m))));
+ DIP("vtst.%u %c%u, %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd',
+ dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ } else {
+ /* VCEQ */
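+            /* Editorial: lanes are equal exactly when their XOR is
+               zero, so VCEQ is computed as NOT(CmpNEZ(n ^ m)) per
+               lane, e.g. equal lanes yield all ones. */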
+ assign(res, unop(Q ? Iop_NotV128 : Iop_Not64,
+ unop(op,
+ binop(Q ? Iop_XorV128 : Iop_Xor64,
+ mkexpr(arg_n),
+ mkexpr(arg_m)))));
+ DIP("vceq.i%u %c%u, %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd',
+ dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ }
+ }
+ break;
+ case 9:
+ if (B == 0) {
+ /* VMLA, VMLS (integer) */
+ IROp op, op2;
+ UInt P = (theInstr >> 24) & 1;
+ if (P) {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_Mul8x16 : Iop_Mul8x8;
+ op2 = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ break;
+ case 1:
+ op = Q ? Iop_Mul16x8 : Iop_Mul16x4;
+ op2 = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ break;
+ case 2:
+ op = Q ? Iop_Mul32x4 : Iop_Mul32x2;
+ op2 = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_Mul8x16 : Iop_Mul8x8;
+ op2 = Q ? Iop_Add8x16 : Iop_Add8x8;
+ break;
+ case 1:
+ op = Q ? Iop_Mul16x8 : Iop_Mul16x4;
+ op2 = Q ? Iop_Add16x8 : Iop_Add16x4;
+ break;
+ case 2:
+ op = Q ? Iop_Mul32x4 : Iop_Mul32x2;
+ op2 = Q ? Iop_Add32x4 : Iop_Add32x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ }
+ assign(res, binop(op2,
+ Q ? getQReg(dreg) : getDRegI64(dreg),
+ binop(op, mkexpr(arg_n), mkexpr(arg_m))));
+ DIP("vml%c.i%u %c%u, %c%u, %c%u\n",
+ P ? 's' : 'a', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ } else {
+ /* VMUL */
+ IROp op;
+ UInt P = (theInstr >> 24) & 1;
+ if (P) {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_PolynomialMul8x16 : Iop_PolynomialMul8x8;
+ break;
+ case 1: case 2: case 3: return False;
+ default: vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0: op = Q ? Iop_Mul8x16 : Iop_Mul8x8; break;
+ case 1: op = Q ? Iop_Mul16x8 : Iop_Mul16x4; break;
+ case 2: op = Q ? Iop_Mul32x4 : Iop_Mul32x2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vmul.%c%u %c%u, %c%u, %c%u\n",
+ P ? 'p' : 'i', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd',
+ mreg);
+ }
+ break;
+ case 10: {
+ /* VPMAX, VPMIN */
+ UInt P = (theInstr >> 4) & 1;
+ IROp op;
+ if (Q)
+ return False;
+ if (P) {
+ switch (size) {
+ case 0: op = U ? Iop_PwMin8Ux8 : Iop_PwMin8Sx8; break;
+ case 1: op = U ? Iop_PwMin16Ux4 : Iop_PwMin16Sx4; break;
+ case 2: op = U ? Iop_PwMin32Ux2 : Iop_PwMin32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0: op = U ? Iop_PwMax8Ux8 : Iop_PwMax8Sx8; break;
+ case 1: op = U ? Iop_PwMax16Ux4 : Iop_PwMax16Sx4; break;
+ case 2: op = U ? Iop_PwMax32Ux2 : Iop_PwMax32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vp%s.%c%u %c%u, %c%u, %c%u\n",
+ P ? "min" : "max", U ? 'u' : 's',
+ 8 << size, Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg,
+ Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 11:
+ if (B == 0) {
+ if (U == 0) {
+ /* VQDMULH */
+            IROp op, op2;
+ ULong imm;
+ switch (size) {
+ case 0: case 3:
+ return False;
+ case 1:
+ op = Q ? Iop_QDMulHi16Sx8 : Iop_QDMulHi16Sx4;
+ op2 = Q ? Iop_CmpEQ16x8 : Iop_CmpEQ16x4;
+ imm = 1LL << 15;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ break;
+ case 2:
+ op = Q ? Iop_QDMulHi32Sx4 : Iop_QDMulHi32Sx2;
+ op2 = Q ? Iop_CmpEQ32x4 : Iop_CmpEQ32x2;
+ imm = 1LL << 31;
+ imm = (imm << 32) | imm;
+ break;
+ default:
+ vassert(0);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+#ifndef DISABLE_QC_FLAG
+ setFlag_QC(binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op2, mkexpr(arg_n),
+ Q ? mkU128(imm) : mkU64(imm)),
+ binop(op2, mkexpr(arg_m),
+ Q ? mkU128(imm) : mkU64(imm))),
+ Q ? mkU128(0) : mkU64(0),
+ Q, condT);
+#endif
+ DIP("vqdmulh.s%u %c%u, %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd',
+ dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ } else {
+ /* VQRDMULH */
+            IROp op, op2;
+ ULong imm;
+ switch(size) {
+ case 0: case 3:
+ return False;
+ case 1:
+ imm = 1LL << 15;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ op = Q ? Iop_QRDMulHi16Sx8 : Iop_QRDMulHi16Sx4;
+ op2 = Q ? Iop_CmpEQ16x8 : Iop_CmpEQ16x4;
+ break;
+ case 2:
+ imm = 1LL << 31;
+ imm = (imm << 32) | imm;
+ op = Q ? Iop_QRDMulHi32Sx4 : Iop_QRDMulHi32Sx2;
+ op2 = Q ? Iop_CmpEQ32x4 : Iop_CmpEQ32x2;
+ break;
+ default:
+ vassert(0);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+#ifndef DISABLE_QC_FLAG
+ setFlag_QC(binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op2, mkexpr(arg_n),
+ Q ? mkU128(imm) : mkU64(imm)),
+ binop(op2, mkexpr(arg_m),
+ Q ? mkU128(imm) : mkU64(imm))),
+ Q ? mkU128(0) : mkU64(0),
+ Q, condT);
+#endif
+ DIP("vqrdmulh.s%u %c%u, %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd',
+ dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ }
+ } else {
+ if (U == 0) {
+ /* VPADD */
+ IROp op;
+ if (Q)
+ return False;
+ switch (size) {
+ case 0: op = Q ? Iop_PwAdd8x16 : Iop_PwAdd8x8; break;
+ case 1: op = Q ? Iop_PwAdd16x8 : Iop_PwAdd16x4; break;
+ case 2: op = Q ? Iop_PwAdd32x4 : Iop_PwAdd32x2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vpadd.i%d %c%u, %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd',
+ dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ }
+ }
+ break;
+      /* The remaining cases are floating-point SIMD */
+ case 13:
+ if (B == 0) {
+ IROp op;
+ if (U == 0) {
+ if ((C >> 1) == 0) {
+ /* VADD */
+               op = Q ? Iop_Add32Fx4 : Iop_Add32Fx2;
+ DIP("vadd.f32 %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ } else {
+ /* VSUB */
+               op = Q ? Iop_Sub32Fx4 : Iop_Sub32Fx2;
+ DIP("vsub.f32 %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ }
+ } else {
+ if ((C >> 1) == 0) {
+ /* VPADD */
+ if (Q)
+ return False;
+ op = Iop_PwAdd32Fx2;
+ DIP("vpadd.f32 d%u, d%u, d%u\n", dreg, nreg, mreg);
+ } else {
+ /* VABD */
+ if (Q) {
+ assign(res, unop(Iop_Abs32Fx4,
+ binop(Iop_Sub32Fx4,
+ mkexpr(arg_n),
+ mkexpr(arg_m))));
+ } else {
+ assign(res, unop(Iop_Abs32Fx2,
+ binop(Iop_Sub32Fx2,
+ mkexpr(arg_n),
+ mkexpr(arg_m))));
+ }
+ DIP("vabd.f32 %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ } else {
+ if (U == 0) {
+ /* VMLA, VMLS */
+ IROp op, op2;
+ UInt P = (theInstr >> 21) & 1;
+ if (P) {
+ switch (size & 1) {
+ case 0:
+ op = Q ? Iop_Mul32Fx4 : Iop_Mul32Fx2;
+ op2 = Q ? Iop_Sub32Fx4 : Iop_Sub32Fx2;
+ break;
+ case 1: return False;
+ default: vassert(0);
+ }
+ } else {
+ switch (size & 1) {
+ case 0:
+ op = Q ? Iop_Mul32Fx4 : Iop_Mul32Fx2;
+ op2 = Q ? Iop_Add32Fx4 : Iop_Add32Fx2;
+ break;
+ case 1: return False;
+ default: vassert(0);
+ }
+ }
+ assign(res, binop(op2,
+ Q ? getQReg(dreg) : getDRegI64(dreg),
+ binop(op, mkexpr(arg_n), mkexpr(arg_m))));
+
+ DIP("vml%c.f32 %c%u, %c%u, %c%u\n",
+ P ? 's' : 'a', Q ? 'q' : 'd',
+ dreg, Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ } else {
+ /* VMUL */
+ IROp op;
+ if ((C >> 1) != 0)
+ return False;
+            op = Q ? Iop_Mul32Fx4 : Iop_Mul32Fx2;
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vmul.f32 %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ }
+ }
+ break;
+ case 14:
+ if (B == 0) {
+ if (U == 0) {
+ if ((C >> 1) == 0) {
+ /* VCEQ */
+ IROp op;
+ if ((theInstr >> 20) & 1)
+ return False;
+ op = Q ? Iop_CmpEQ32Fx4 : Iop_CmpEQ32Fx2;
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vceq.f32 %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ } else {
+ return False;
+ }
+ } else {
+ if ((C >> 1) == 0) {
+ /* VCGE */
+ IROp op;
+ if ((theInstr >> 20) & 1)
+ return False;
+ op = Q ? Iop_CmpGE32Fx4 : Iop_CmpGE32Fx2;
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vcge.f32 %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ } else {
+ /* VCGT */
+ IROp op;
+ if ((theInstr >> 20) & 1)
+ return False;
+ op = Q ? Iop_CmpGT32Fx4 : Iop_CmpGT32Fx2;
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ DIP("vcgt.f32 %c%u, %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ }
+ }
+ } else {
+ if (U == 1) {
+ /* VACGE, VACGT */
+ UInt op_bit = (theInstr >> 21) & 1;
+ IROp op, op2;
+ op2 = Q ? Iop_Abs32Fx4 : Iop_Abs32Fx2;
+ if (op_bit) {
+ op = Q ? Iop_CmpGT32Fx4 : Iop_CmpGT32Fx2;
+ assign(res, binop(op,
+ unop(op2, mkexpr(arg_n)),
+ unop(op2, mkexpr(arg_m))));
+ } else {
+ op = Q ? Iop_CmpGE32Fx4 : Iop_CmpGE32Fx2;
+ assign(res, binop(op,
+ unop(op2, mkexpr(arg_n)),
+ unop(op2, mkexpr(arg_m))));
+ }
+ DIP("vacg%c.f32 %c%u, %c%u, %c%u\n", op_bit ? 't' : 'e',
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg,
+ Q ? 'q' : 'd', mreg);
+ }
+ }
+ break;
+ case 15:
+ if (B == 0) {
+ if (U == 0) {
+ /* VMAX, VMIN */
+ IROp op;
+ if ((theInstr >> 20) & 1)
+ return False;
+ if ((theInstr >> 21) & 1) {
+ op = Q ? Iop_Min32Fx4 : Iop_Min32Fx2;
+ DIP("vmin.f32 %c%u, %c%u, %c%u\n", Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ } else {
+ op = Q ? Iop_Max32Fx4 : Iop_Max32Fx2;
+ DIP("vmax.f32 %c%u, %c%u, %c%u\n", Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ } else {
+ /* VPMAX, VPMIN */
+ IROp op;
+ if (Q)
+ return False;
+ if ((theInstr >> 20) & 1)
+ return False;
+ if ((theInstr >> 21) & 1) {
+ op = Iop_PwMin32Fx2;
+ DIP("vpmin.f32 d%u, d%u, d%u\n", dreg, nreg, mreg);
+ } else {
+ op = Iop_PwMax32Fx2;
+ DIP("vpmax.f32 d%u, d%u, d%u\n", dreg, nreg, mreg);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ }
+ } else {
+ if (U == 0) {
+ if ((C >> 1) == 0) {
+ /* VRECPS */
+ if ((theInstr >> 20) & 1)
+ return False;
+ assign(res, binop(Q ? Iop_Recps32Fx4 : Iop_Recps32Fx2,
+ mkexpr(arg_n),
+ mkexpr(arg_m)));
+ DIP("vrecps.f32 %c%u, %c%u, %c%u\n", Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ } else {
+ /* VRSQRTS */
+ if ((theInstr >> 20) & 1)
+ return False;
+ assign(res, binop(Q ? Iop_Rsqrts32Fx4 : Iop_Rsqrts32Fx2,
+ mkexpr(arg_n),
+ mkexpr(arg_m)));
+ DIP("vrsqrts.f32 %c%u, %c%u, %c%u\n", Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, Q ? 'q' : 'd', mreg);
+ }
+ }
+ }
+ break;
+ }
+
+ if (Q) {
+ putQReg(dreg, mkexpr(res), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res), condT);
+ }
+
+ return True;
+}
+
+/* A7.4.2 Three registers of different length */
+static
+Bool dis_neon_data_3diff ( UInt theInstr, IRTemp condT )
+{
+ UInt A = (theInstr >> 8) & 0xf;
+ UInt B = (theInstr >> 20) & 3;
+ UInt U = (theInstr >> 24) & 1;
+ UInt P = (theInstr >> 9) & 1;
+ UInt mreg = get_neon_m_regno(theInstr);
+ UInt nreg = get_neon_n_regno(theInstr);
+ UInt dreg = get_neon_d_regno(theInstr);
+ UInt size = B;
+ ULong imm;
+ IRTemp res, arg_m, arg_n, cond, tmp;
+ IROp cvt, cvt2, cmp, op, op2, sh, add;
+ switch (A) {
+ case 0: case 1: case 2: case 3:
+ /* VADDL, VADDW, VSUBL, VSUBW */
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ size = B;
+ switch (size) {
+ case 0:
+ cvt = U ? Iop_Longen8Ux8 : Iop_Longen8Sx8;
+ op = (A & 2) ? Iop_Sub16x8 : Iop_Add16x8;
+ break;
+ case 1:
+ cvt = U ? Iop_Longen16Ux4 : Iop_Longen16Sx4;
+ op = (A & 2) ? Iop_Sub32x4 : Iop_Add32x4;
+ break;
+ case 2:
+ cvt = U ? Iop_Longen32Ux2 : Iop_Longen32Sx2;
+ op = (A & 2) ? Iop_Sub64x2 : Iop_Add64x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ arg_n = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_V128);
+ if (A & 1) {
+ if (nreg & 1)
+ return False;
+ nreg >>= 1;
+ assign(arg_n, getQReg(nreg));
+ } else {
+ assign(arg_n, unop(cvt, getDRegI64(nreg)));
+ }
+ assign(arg_m, unop(cvt, getDRegI64(mreg)));
+ putQReg(dreg, binop(op, mkexpr(arg_n), mkexpr(arg_m)),
+ condT);
+ DIP("v%s%c.%c%u q%u, %c%u, d%u\n", (A & 2) ? "sub" : "add",
+ (A & 1) ? 'w' : 'l', U ? 'u' : 's', 8 << size, dreg,
+ (A & 1) ? 'q' : 'd', nreg, mreg);
+ return True;
+ case 4:
+ /* VADDHN, VRADDHN */
+ if (mreg & 1)
+ return False;
+ mreg >>= 1;
+ if (nreg & 1)
+ return False;
+ nreg >>= 1;
+ size = B;
+ switch (size) {
+ case 0:
+ op = Iop_Add16x8;
+ cvt = Iop_Shorten16x8;
+ sh = Iop_ShrN16x8;
+ imm = 1U << 7;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ break;
+ case 1:
+ op = Iop_Add32x4;
+ cvt = Iop_Shorten32x4;
+ sh = Iop_ShrN32x4;
+ imm = 1U << 15;
+ imm = (imm << 32) | imm;
+ break;
+ case 2:
+ op = Iop_Add64x2;
+ cvt = Iop_Shorten64x2;
+ sh = Iop_ShrN64x2;
+ imm = 1U << 31;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ tmp = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ assign(tmp, binop(op, getQReg(nreg), getQReg(mreg)));
+ if (U) {
+ /* VRADDHN */
+ assign(res, binop(op, mkexpr(tmp),
+ binop(Iop_64HLtoV128, mkU64(imm), mkU64(imm))));
+ } else {
+ assign(res, mkexpr(tmp));
+ }
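+      /* Editorial: for VRADDHN the constant adds 1 << 7 to each
+         16-bit lane (for .i16) before the high byte is taken, e.g.
+         (0x0040 + 0x0041 + 0x0080) >> 8 = 1 where plain VADDHN would
+         truncate 0x0081 >> 8 to 0. */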
+ putDRegI64(dreg, unop(cvt, binop(sh, mkexpr(res), mkU8(8 << size))),
+ condT);
+ DIP("v%saddhn.i%u d%u, q%u, q%u\n", U ? "r" : "", 16 << size, dreg,
+ nreg, mreg);
+ return True;
+ case 5:
+ /* VABAL */
+ if (!((theInstr >> 23) & 1)) {
+ vpanic("VABA should not be in dis_neon_data_3diff\n");
+ }
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ switch (size) {
+ case 0:
+ cmp = U ? Iop_CmpGT8Ux8 : Iop_CmpGT8Sx8;
+ cvt = U ? Iop_Longen8Ux8 : Iop_Longen8Sx8;
+ cvt2 = Iop_Longen8Sx8;
+ op = Iop_Sub16x8;
+ op2 = Iop_Add16x8;
+ break;
+ case 1:
+ cmp = U ? Iop_CmpGT16Ux4 : Iop_CmpGT16Sx4;
+ cvt = U ? Iop_Longen16Ux4 : Iop_Longen16Sx4;
+ cvt2 = Iop_Longen16Sx4;
+ op = Iop_Sub32x4;
+ op2 = Iop_Add32x4;
+ break;
+ case 2:
+ cmp = U ? Iop_CmpGT32Ux2 : Iop_CmpGT32Sx2;
+ cvt = U ? Iop_Longen32Ux2 : Iop_Longen32Sx2;
+ cvt2 = Iop_Longen32Sx2;
+ op = Iop_Sub64x2;
+ op2 = Iop_Add64x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ arg_n = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_V128);
+ cond = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ assign(arg_n, unop(cvt, getDRegI64(nreg)));
+ assign(arg_m, unop(cvt, getDRegI64(mreg)));
+ assign(cond, unop(cvt2, binop(cmp, getDRegI64(nreg),
+ getDRegI64(mreg))));
+ assign(res, binop(op2,
+ binop(Iop_OrV128,
+ binop(Iop_AndV128,
+ binop(op, mkexpr(arg_n), mkexpr(arg_m)),
+ mkexpr(cond)),
+ binop(Iop_AndV128,
+ binop(op, mkexpr(arg_m), mkexpr(arg_n)),
+ unop(Iop_NotV128, mkexpr(cond)))),
+ getQReg(dreg)));
+ putQReg(dreg, mkexpr(res), condT);
+ DIP("vabal.%c%u q%u, d%u, d%u\n", U ? 'u' : 's', 8 << size, dreg,
+ nreg, mreg);
+ return True;
+ case 6:
+ /* VSUBHN, VRSUBHN */
+ if (mreg & 1)
+ return False;
+ mreg >>= 1;
+ if (nreg & 1)
+ return False;
+ nreg >>= 1;
+ size = B;
+ switch (size) {
+ case 0:
+ op = Iop_Sub16x8;
+ op2 = Iop_Add16x8;
+ cvt = Iop_Shorten16x8;
+ sh = Iop_ShrN16x8;
+ imm = 1U << 7;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ break;
+ case 1:
+ op = Iop_Sub32x4;
+ op2 = Iop_Add32x4;
+ cvt = Iop_Shorten32x4;
+ sh = Iop_ShrN32x4;
+ imm = 1U << 15;
+ imm = (imm << 32) | imm;
+ break;
+ case 2:
+ op = Iop_Sub64x2;
+ op2 = Iop_Add64x2;
+ cvt = Iop_Shorten64x2;
+ sh = Iop_ShrN64x2;
+ imm = 1U << 31;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ tmp = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ assign(tmp, binop(op, getQReg(nreg), getQReg(mreg)));
+ if (U) {
+ /* VRSUBHN */
+ assign(res, binop(op2, mkexpr(tmp),
+ binop(Iop_64HLtoV128, mkU64(imm), mkU64(imm))));
+ } else {
+ assign(res, mkexpr(tmp));
+ }
+ putDRegI64(dreg, unop(cvt, binop(sh, mkexpr(res), mkU8(8 << size))),
+ condT);
+ DIP("v%ssubhn.i%u d%u, q%u, q%u\n", U ? "r" : "", 16 << size, dreg,
+ nreg, mreg);
+ return True;
+ case 7:
+ /* VABDL */
+ if (!((theInstr >> 23) & 1)) {
+ vpanic("VABL should not be in dis_neon_data_3diff\n");
+ }
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ switch (size) {
+ case 0:
+ cmp = U ? Iop_CmpGT8Ux8 : Iop_CmpGT8Sx8;
+ cvt = U ? Iop_Longen8Ux8 : Iop_Longen8Sx8;
+ cvt2 = Iop_Longen8Sx8;
+ op = Iop_Sub16x8;
+ break;
+ case 1:
+ cmp = U ? Iop_CmpGT16Ux4 : Iop_CmpGT16Sx4;
+ cvt = U ? Iop_Longen16Ux4 : Iop_Longen16Sx4;
+ cvt2 = Iop_Longen16Sx4;
+ op = Iop_Sub32x4;
+ break;
+ case 2:
+ cmp = U ? Iop_CmpGT32Ux2 : Iop_CmpGT32Sx2;
+ cvt = U ? Iop_Longen32Ux2 : Iop_Longen32Sx2;
+ cvt2 = Iop_Longen32Sx2;
+ op = Iop_Sub64x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ arg_n = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_V128);
+ cond = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ assign(arg_n, unop(cvt, getDRegI64(nreg)));
+ assign(arg_m, unop(cvt, getDRegI64(mreg)));
+ assign(cond, unop(cvt2, binop(cmp, getDRegI64(nreg),
+ getDRegI64(mreg))));
+ assign(res, binop(Iop_OrV128,
+ binop(Iop_AndV128,
+ binop(op, mkexpr(arg_n), mkexpr(arg_m)),
+ mkexpr(cond)),
+ binop(Iop_AndV128,
+ binop(op, mkexpr(arg_m), mkexpr(arg_n)),
+ unop(Iop_NotV128, mkexpr(cond)))));
+ putQReg(dreg, mkexpr(res), condT);
+ DIP("vabdl.%c%u q%u, d%u, d%u\n", U ? 'u' : 's', 8 << size, dreg,
+ nreg, mreg);
+ return True;
+ case 8:
+ case 10:
+ /* VMLAL, VMLSL (integer) */
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ size = B;
+ switch (size) {
+ case 0:
+ op = U ? Iop_Mull8Ux8 : Iop_Mull8Sx8;
+ op2 = P ? Iop_Sub16x8 : Iop_Add16x8;
+ break;
+ case 1:
+ op = U ? Iop_Mull16Ux4 : Iop_Mull16Sx4;
+ op2 = P ? Iop_Sub32x4 : Iop_Add32x4;
+ break;
+ case 2:
+ op = U ? Iop_Mull32Ux2 : Iop_Mull32Sx2;
+ op2 = P ? Iop_Sub64x2 : Iop_Add64x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ res = newTemp(Ity_V128);
+ assign(res, binop(op, getDRegI64(nreg),getDRegI64(mreg)));
+ putQReg(dreg, binop(op2, getQReg(dreg), mkexpr(res)), condT);
+ DIP("vml%cl.%c%u q%u, d%u, d%u\n", P ? 's' : 'a', U ? 'u' : 's',
+ 8 << size, dreg, nreg, mreg);
+ return True;
+ case 9:
+ case 11:
+ /* VQDMLAL, VQDMLSL */
+ if (U)
+ return False;
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ size = B;
+ switch (size) {
+ case 0: case 3:
+ return False;
+ case 1:
+ op = Iop_QDMulLong16Sx4;
+ cmp = Iop_CmpEQ16x4;
+ add = P ? Iop_QSub32Sx4 : Iop_QAdd32Sx4;
+ op2 = P ? Iop_Sub32x4 : Iop_Add32x4;
+ imm = 1LL << 15;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ break;
+ case 2:
+ op = Iop_QDMulLong32Sx2;
+ cmp = Iop_CmpEQ32x2;
+ add = P ? Iop_QSub64Sx2 : Iop_QAdd64Sx2;
+ op2 = P ? Iop_Sub64x2 : Iop_Add64x2;
+ imm = 1LL << 31;
+ imm = (imm << 32) | imm;
+ break;
+ default:
+ vassert(0);
+ }
+ res = newTemp(Ity_V128);
+ tmp = newTemp(Ity_V128);
+ assign(res, binop(op, getDRegI64(nreg), getDRegI64(mreg)));
+#ifndef DISABLE_QC_FLAG
+ assign(tmp, binop(op2, getQReg(dreg), mkexpr(res)));
+ setFlag_QC(mkexpr(tmp), binop(add, getQReg(dreg), mkexpr(res)),
+ True, condT);
+ setFlag_QC(binop(Iop_And64,
+ binop(cmp, getDRegI64(nreg), mkU64(imm)),
+ binop(cmp, getDRegI64(mreg), mkU64(imm))),
+ mkU64(0),
+ False, condT);
+#endif
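+      /* Editorial: the equality-with-0x8000 check above catches the
+         one input pair the doubling multiply cannot represent, e.g.
+         for .s16 both operands -32768: 2 * (-32768 * -32768) = 2^31,
+         which overflows the signed 32-bit product lane. */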
+ putQReg(dreg, binop(add, getQReg(dreg), mkexpr(res)), condT);
+ DIP("vqdml%cl.s%u q%u, d%u, d%u\n", P ? 's' : 'a', 8 << size, dreg,
+ nreg, mreg);
+ return True;
+ case 12:
+ case 14:
+ /* VMULL (integer or polynomial) */
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ size = B;
+ switch (size) {
+ case 0:
+ op = (U) ? Iop_Mull8Ux8 : Iop_Mull8Sx8;
+ if (P)
+ op = Iop_PolynomialMull8x8;
+ break;
+ case 1:
+ op = (U) ? Iop_Mull16Ux4 : Iop_Mull16Sx4;
+ break;
+ case 2:
+ op = (U) ? Iop_Mull32Ux2 : Iop_Mull32Sx2;
+ break;
+ default:
+ vassert(0);
+ }
+ putQReg(dreg, binop(op, getDRegI64(nreg),
+ getDRegI64(mreg)), condT);
+ DIP("vmull.%c%u q%u, d%u, d%u\n", P ? 'p' : (U ? 'u' : 's'),
+ 8 << size, dreg, nreg, mreg);
+ return True;
+ case 13:
+ /* VQDMULL */
+ if (U)
+ return False;
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ size = B;
+ switch (size) {
+ case 0:
+ case 3:
+ return False;
+ case 1:
+ op = Iop_QDMulLong16Sx4;
+ op2 = Iop_CmpEQ16x4;
+ imm = 1LL << 15;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ break;
+ case 2:
+ op = Iop_QDMulLong32Sx2;
+ op2 = Iop_CmpEQ32x2;
+ imm = 1LL << 31;
+ imm = (imm << 32) | imm;
+ break;
+ default:
+ vassert(0);
+ }
+ putQReg(dreg, binop(op, getDRegI64(nreg), getDRegI64(mreg)),
+ condT);
+#ifndef DISABLE_QC_FLAG
+ setFlag_QC(binop(Iop_And64,
+ binop(op2, getDRegI64(nreg), mkU64(imm)),
+ binop(op2, getDRegI64(mreg), mkU64(imm))),
+ mkU64(0),
+ False, condT);
+#endif
+ DIP("vqdmull.s%u q%u, d%u, d%u\n", 8 << size, dreg, nreg, mreg);
+ return True;
+ default:
+ return False;
+ }
+ return False;
+}
+
+/* A7.4.3 Two registers and a scalar */
+static
+Bool dis_neon_data_2reg_and_scalar ( UInt theInstr, IRTemp condT )
+{
+# define INSN(_bMax,_bMin) SLICE_UInt(theInstr, (_bMax), (_bMin))
+ UInt U = INSN(24,24);
+ UInt dreg = get_neon_d_regno(theInstr & ~(1 << 6));
+ UInt nreg = get_neon_n_regno(theInstr & ~(1 << 6));
+ UInt mreg = get_neon_m_regno(theInstr & ~(1 << 6));
+ UInt size = INSN(21,20);
+ UInt index;
+ UInt Q = INSN(24,24);
+
+ if (INSN(27,25) != 1 || INSN(23,23) != 1
+ || INSN(6,6) != 1 || INSN(4,4) != 0)
+ return False;
+
+ /* VMLA, VMLS (scalar) */
+ if ((INSN(11,8) & BITS4(1,0,1,0)) == BITS4(0,0,0,0)) {
+ IRTemp res, arg_m, arg_n;
+ IROp dup, get, op, op2, add, sub;
+ if (Q) {
+ if ((dreg & 1) || (nreg & 1))
+ return False;
+ dreg >>= 1;
+ nreg >>= 1;
+ res = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_V128);
+ arg_n = newTemp(Ity_V128);
+ assign(arg_n, getQReg(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x8;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x4;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ } else {
+ res = newTemp(Ity_I64);
+ arg_m = newTemp(Ity_I64);
+ arg_n = newTemp(Ity_I64);
+ assign(arg_n, getDRegI64(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x4;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x2;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ }
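+      /* Editorial note on the scalar decoding above: for a 16-bit
+         scalar the index and register share the M:Vm field as
+         index * 8 + register (e.g. 0x15 -> d5[2]); for a 32-bit
+         scalar it is index * 16 + register. */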
+ if (INSN(8,8)) {
+ switch (size) {
+ case 2:
+ op = Q ? Iop_Mul32Fx4 : Iop_Mul32Fx2;
+ add = Q ? Iop_Add32Fx4 : Iop_Add32Fx2;
+ sub = Q ? Iop_Sub32Fx4 : Iop_Sub32Fx2;
+ break;
+ case 0:
+ case 1:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 1:
+ op = Q ? Iop_Mul16x8 : Iop_Mul16x4;
+ add = Q ? Iop_Add16x8 : Iop_Add16x4;
+ sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ break;
+ case 2:
+ op = Q ? Iop_Mul32x4 : Iop_Mul32x2;
+ add = Q ? Iop_Add32x4 : Iop_Add32x2;
+ sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ }
+ op2 = INSN(10,10) ? sub : add;
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ if (Q)
+ putQReg(dreg, binop(op2, getQReg(dreg), mkexpr(res)),
+ condT);
+ else
+ putDRegI64(dreg, binop(op2, getDRegI64(dreg), mkexpr(res)),
+ condT);
+ DIP("vml%c.%c%u %c%u, %c%u, d%u[%u]\n", INSN(10,10) ? 's' : 'a',
+ INSN(8,8) ? 'f' : 'i', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', nreg, mreg, index);
+ return True;
+ }
+
+ /* VMLAL, VMLSL (scalar) */
+ if ((INSN(11,8) & BITS4(1,0,1,1)) == BITS4(0,0,1,0)) {
+ IRTemp res, arg_m, arg_n;
+ IROp dup, get, op, op2, add, sub;
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ res = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_I64);
+ arg_n = newTemp(Ity_I64);
+ assign(arg_n, getDRegI64(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x4;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x2;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ switch (size) {
+ case 1:
+ op = U ? Iop_Mull16Ux4 : Iop_Mull16Sx4;
+ add = Iop_Add32x4;
+ sub = Iop_Sub32x4;
+ break;
+ case 2:
+ op = U ? Iop_Mull32Ux2 : Iop_Mull32Sx2;
+ add = Iop_Add64x2;
+ sub = Iop_Sub64x2;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ op2 = INSN(10,10) ? sub : add;
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ putQReg(dreg, binop(op2, getQReg(dreg), mkexpr(res)), condT);
+ DIP("vml%cl.%c%u q%u, d%u, d%u[%u]\n",
+ INSN(10,10) ? 's' : 'a', U ? 'u' : 's',
+ 8 << size, dreg, nreg, mreg, index);
+ return True;
+ }
+
+ /* VQDMLAL, VQDMLSL (scalar) */
+ if ((INSN(11,8) & BITS4(1,0,1,1)) == BITS4(0,0,1,1) && !U) {
+ IRTemp res, arg_m, arg_n, tmp;
+ IROp dup, get, op, op2, add, cmp;
+ UInt P = INSN(10,10);
+ ULong imm;
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ res = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_I64);
+ arg_n = newTemp(Ity_I64);
+ assign(arg_n, getDRegI64(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x4;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x2;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ switch (size) {
+ case 0:
+ case 3:
+ return False;
+ case 1:
+ op = Iop_QDMulLong16Sx4;
+ cmp = Iop_CmpEQ16x4;
+ add = P ? Iop_QSub32Sx4 : Iop_QAdd32Sx4;
+ op2 = P ? Iop_Sub32x4 : Iop_Add32x4;
+ imm = 1LL << 15;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ break;
+ case 2:
+ op = Iop_QDMulLong32Sx2;
+ cmp = Iop_CmpEQ32x2;
+ add = P ? Iop_QSub64Sx2 : Iop_QAdd64Sx2;
+ op2 = P ? Iop_Sub64x2 : Iop_Add64x2;
+ imm = 1LL << 31;
+ imm = (imm << 32) | imm;
+ break;
+ default:
+ vassert(0);
+ }
+ res = newTemp(Ity_V128);
+ tmp = newTemp(Ity_V128);
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+#ifndef DISABLE_QC_FLAG
+ assign(tmp, binop(op2, getQReg(dreg), mkexpr(res)));
+ setFlag_QC(binop(Iop_And64,
+ binop(cmp, mkexpr(arg_n), mkU64(imm)),
+ binop(cmp, mkexpr(arg_m), mkU64(imm))),
+ mkU64(0),
+ False, condT);
+ setFlag_QC(mkexpr(tmp), binop(add, getQReg(dreg), mkexpr(res)),
+ True, condT);
+#endif
+ putQReg(dreg, binop(add, getQReg(dreg), mkexpr(res)), condT);
+ DIP("vqdml%cl.s%u q%u, d%u, d%u[%u]\n", P ? 's' : 'a', 8 << size,
+ dreg, nreg, mreg, index);
+ return True;
+ }
+
+ /* VMUL (by scalar) */
+ if ((INSN(11,8) & BITS4(1,1,1,0)) == BITS4(1,0,0,0)) {
+ IRTemp res, arg_m, arg_n;
+ IROp dup, get, op;
+ if (Q) {
+ if ((dreg & 1) || (nreg & 1))
+ return False;
+ dreg >>= 1;
+ nreg >>= 1;
+ res = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_V128);
+ arg_n = newTemp(Ity_V128);
+ assign(arg_n, getQReg(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x8;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x4;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ } else {
+ res = newTemp(Ity_I64);
+ arg_m = newTemp(Ity_I64);
+ arg_n = newTemp(Ity_I64);
+ assign(arg_n, getDRegI64(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x4;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x2;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ }
+ switch (size) {
+ case 1:
+ op = Q ? Iop_Mul16x8 : Iop_Mul16x4;
+ break;
+ case 2:
+ op = Q ? Iop_Mul32x4 : Iop_Mul32x2;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ if (Q)
+ putQReg(dreg, mkexpr(res), condT);
+ else
+ putDRegI64(dreg, mkexpr(res), condT);
+ DIP("vmul.i%u %c%u, %c%u, d%u[%u]\n", 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, mreg, index);
+ return True;
+ }
+
+ /* VMULL (scalar) */
+ if (INSN(11,8) == BITS4(1,0,1,0)) {
+ IRTemp res, arg_m, arg_n;
+ IROp dup, get, op;
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ res = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_I64);
+ arg_n = newTemp(Ity_I64);
+ assign(arg_n, getDRegI64(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x4;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x2;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ switch (size) {
+ case 1: op = U ? Iop_Mull16Ux4 : Iop_Mull16Sx4; break;
+ case 2: op = U ? Iop_Mull32Ux2 : Iop_Mull32Sx2; break;
+ case 0: case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+ putQReg(dreg, mkexpr(res), condT);
+ DIP("vmull.%c%u q%u, d%u, d%u[%u]\n", U ? 'u' : 's', 8 << size, dreg,
+ nreg, mreg, index);
+ return True;
+ }
+
+ /* VQDMULL */
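+ /* 2*(x*y) overflows the double-width result only when x and y are
+ both the most negative value, so QC is set exactly when
+ corresponding lanes of both operands equal 'imm' (INT16_MIN or
+ INT32_MIN replicated across lanes). */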
+ if (INSN(11,8) == BITS4(1,0,1,1) && !U) {
+ IROp op, op2, dup, get;
+ ULong imm;
+ IRTemp res, arg_m, arg_n;
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ res = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_I64);
+ arg_n = newTemp(Ity_I64);
+ assign(arg_n, getDRegI64(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x4;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x2;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ switch (size) {
+ case 0:
+ case 3:
+ return False;
+ case 1:
+ op = Iop_QDMulLong16Sx4;
+ op2 = Iop_CmpEQ16x4;
+ imm = 1LL << 15;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ break;
+ case 2:
+ op = Iop_QDMulLong32Sx2;
+ op2 = Iop_CmpEQ32x2;
+ imm = 1LL << 31;
+ imm = (imm << 32) | imm;
+ break;
+ default:
+ vassert(0);
+ }
+ putQReg(dreg, binop(op, mkexpr(arg_n), mkexpr(arg_m)),
+ condT);
+#ifndef DISABLE_QC_FLAG
+ setFlag_QC(binop(Iop_And64,
+ binop(op2, mkexpr(arg_n), mkU64(imm)),
+ binop(op2, mkexpr(arg_m), mkU64(imm))),
+ mkU64(0),
+ False, condT);
+#endif
+ DIP("vqdmull.s%u q%u, d%u, d%u[%u]\n", 8 << size, dreg, nreg, mreg,
+ index);
+ return True;
+ }
+
+ /* VQDMULH */
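+ /* Doubling multiply returning the high half.  As with VQDMULL, the
+ only saturating case is when both operands are the most negative
+ value, so the QC check just compares each operand against 'imm'
+ (INT_MIN per lane).  VQRDMULH below uses the same scheme. */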
+ if (INSN(11,8) == BITS4(1,1,0,0)) {
+ IROp op, op2, dup, get;
+ ULong imm;
+ IRTemp res, arg_m, arg_n;
+ if (Q) {
+ if ((dreg & 1) || (nreg & 1))
+ return False;
+ dreg >>= 1;
+ nreg >>= 1;
+ res = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_V128);
+ arg_n = newTemp(Ity_V128);
+ assign(arg_n, getQReg(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x8;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x4;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ } else {
+ res = newTemp(Ity_I64);
+ arg_m = newTemp(Ity_I64);
+ arg_n = newTemp(Ity_I64);
+ assign(arg_n, getDRegI64(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x4;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x2;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ }
+ switch (size) {
+ case 0:
+ case 3:
+ return False;
+ case 1:
+ op = Q ? Iop_QDMulHi16Sx8 : Iop_QDMulHi16Sx4;
+ op2 = Q ? Iop_CmpEQ16x8 : Iop_CmpEQ16x4;
+ imm = 1LL << 15;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ break;
+ case 2:
+ op = Q ? Iop_QDMulHi32Sx4 : Iop_QDMulHi32Sx2;
+ op2 = Q ? Iop_CmpEQ32x4 : Iop_CmpEQ32x2;
+ imm = 1LL << 31;
+ imm = (imm << 32) | imm;
+ break;
+ default:
+ vassert(0);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+#ifndef DISABLE_QC_FLAG
+ setFlag_QC(binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op2, mkexpr(arg_n),
+ Q ? mkU128(imm) : mkU64(imm)),
+ binop(op2, mkexpr(arg_m),
+ Q ? mkU128(imm) : mkU64(imm))),
+ Q ? mkU128(0) : mkU64(0),
+ Q, condT);
+#endif
+ if (Q)
+ putQReg(dreg, mkexpr(res), condT);
+ else
+ putDRegI64(dreg, mkexpr(res), condT);
+ DIP("vqdmulh.s%u %c%u, %c%u, d%u[%u]\n",
+ 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, mreg, index);
+ return True;
+ }
+
+ /* VQRDMULH (scalar) */
+ if (INSN(11,8) == BITS4(1,1,0,1)) {
+ IROp op, op2, dup, get;
+ ULong imm;
+ IRTemp res, arg_m, arg_n;
+ if (Q) {
+ if ((dreg & 1) || (nreg & 1))
+ return False;
+ dreg >>= 1;
+ nreg >>= 1;
+ res = newTemp(Ity_V128);
+ arg_m = newTemp(Ity_V128);
+ arg_n = newTemp(Ity_V128);
+ assign(arg_n, getQReg(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x8;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x4;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ } else {
+ res = newTemp(Ity_I64);
+ arg_m = newTemp(Ity_I64);
+ arg_n = newTemp(Ity_I64);
+ assign(arg_n, getDRegI64(nreg));
+ switch(size) {
+ case 1:
+ dup = Iop_Dup16x4;
+ get = Iop_GetElem16x4;
+ index = mreg >> 3;
+ mreg &= 7;
+ break;
+ case 2:
+ dup = Iop_Dup32x2;
+ get = Iop_GetElem32x2;
+ index = mreg >> 4;
+ mreg &= 0xf;
+ break;
+ case 0:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(arg_m, unop(dup, binop(get, getDRegI64(mreg), mkU8(index))));
+ }
+ switch (size) {
+ case 0:
+ case 3:
+ return False;
+ case 1:
+ op = Q ? Iop_QRDMulHi16Sx8 : Iop_QRDMulHi16Sx4;
+ op2 = Q ? Iop_CmpEQ16x8 : Iop_CmpEQ16x4;
+ imm = 1LL << 15;
+ imm = (imm << 16) | imm;
+ imm = (imm << 32) | imm;
+ break;
+ case 2:
+ op = Q ? Iop_QRDMulHi32Sx4 : Iop_QRDMulHi32Sx2;
+ op2 = Q ? Iop_CmpEQ32x4 : Iop_CmpEQ32x2;
+ imm = 1LL << 31;
+ imm = (imm << 32) | imm;
+ break;
+ default:
+ vassert(0);
+ }
+ assign(res, binop(op, mkexpr(arg_n), mkexpr(arg_m)));
+#ifndef DISABLE_QC_FLAG
+ setFlag_QC(binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op2, mkexpr(arg_n),
+ Q ? mkU128(imm) : mkU64(imm)),
+ binop(op2, mkexpr(arg_m),
+ Q ? mkU128(imm) : mkU64(imm))),
+ Q ? mkU128(0) : mkU64(0),
+ Q, condT);
+#endif
+ if (Q)
+ putQReg(dreg, mkexpr(res), condT);
+ else
+ putDRegI64(dreg, mkexpr(res), condT);
+ DIP("vqrdmulh.s%u %c%u, %c%u, d%u[%u]\n",
+ 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', nreg, mreg, index);
+ return True;
+ }
+
+ return False;
+# undef INSN
+}
+
+/* A7.4.4 Two registers and a shift amount */
+static
+Bool dis_neon_data_2reg_and_shift ( UInt theInstr, IRTemp condT )
+{
+ UInt A = (theInstr >> 8) & 0xf;
+ UInt B = (theInstr >> 6) & 1;
+ UInt L = (theInstr >> 7) & 1;
+ UInt U = (theInstr >> 24) & 1;
+ UInt Q = B;
+ UInt imm6 = (theInstr >> 16) & 0x3f;
+ UInt shift_imm;
+ UInt size = 4;
+ UInt tmp;
+ UInt mreg = get_neon_m_regno(theInstr);
+ UInt dreg = get_neon_d_regno(theInstr);
+ ULong imm = 0;
+ IROp op, cvt, add = Iop_INVALID, cvt2, op_rev;
+ IRTemp reg_m, res, mask;
+
+ if (L == 0 && ((theInstr >> 19) & 7) == 0)
+ /* This is the "one register and a modified immediate" case (A7.4.6) */
+ return False;
+
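+ /* The position of the leading 1 in (L:imm6) encodes the element
+ size; the shift amount for the right-shift forms is then
+ (2*esize) - imm6, which for 64-bit elements degenerates to
+ 64 - imm6. */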
+ tmp = (L << 6) | imm6;
+ if (tmp & 0x40) {
+ size = 3;
+ shift_imm = 64 - imm6;
+ } else if (tmp & 0x20) {
+ size = 2;
+ shift_imm = 64 - imm6;
+ } else if (tmp & 0x10) {
+ size = 1;
+ shift_imm = 32 - imm6;
+ } else if (tmp & 0x8) {
+ size = 0;
+ shift_imm = 16 - imm6;
+ } else {
+ return False;
+ }
+
+ switch (A) {
+ case 3:
+ case 2:
+ /* VRSHR, VRSRA */
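+ /* Rounding right shift: per lane this computes
+ (m >> n) + ((m >> (n-1)) & 1),
+ i.e. the last bit shifted out is added back in as the rounding
+ increment.  'imm' below is the per-lane LSB mask used to isolate
+ that bit. */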
+ if (shift_imm > 0) {
+ IRExpr *imm_val;
+ imm = 1L;
+ switch (size) {
+ case 0:
+ imm = (imm << 8) | imm;
+ /* fall through */
+ case 1:
+ imm = (imm << 16) | imm;
+ /* fall through */
+ case 2:
+ imm = (imm << 32) | imm;
+ /* fall through */
+ case 3:
+ break;
+ default:
+ vassert(0);
+ }
+ if (Q) {
+ reg_m = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ imm_val = binop(Iop_64HLtoV128, mkU64(imm), mkU64(imm));
+ assign(reg_m, getQReg(mreg));
+ switch (size) {
+ case 0:
+ add = Iop_Add8x16;
+ op = U ? Iop_ShrN8x16 : Iop_SarN8x16;
+ break;
+ case 1:
+ add = Iop_Add16x8;
+ op = U ? Iop_ShrN16x8 : Iop_SarN16x8;
+ break;
+ case 2:
+ add = Iop_Add32x4;
+ op = U ? Iop_ShrN32x4 : Iop_SarN32x4;
+ break;
+ case 3:
+ add = Iop_Add64x2;
+ op = U ? Iop_ShrN64x2 : Iop_SarN64x2;
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ reg_m = newTemp(Ity_I64);
+ res = newTemp(Ity_I64);
+ imm_val = mkU64(imm);
+ assign(reg_m, getDRegI64(mreg));
+ switch (size) {
+ case 0:
+ add = Iop_Add8x8;
+ op = U ? Iop_ShrN8x8 : Iop_SarN8x8;
+ break;
+ case 1:
+ add = Iop_Add16x4;
+ op = U ? Iop_ShrN16x4 : Iop_SarN16x4;
+ break;
+ case 2:
+ add = Iop_Add32x2;
+ op = U ? Iop_ShrN32x2 : Iop_SarN32x2;
+ break;
+ case 3:
+ add = Iop_Add64;
+ op = U ? Iop_Shr64 : Iop_Sar64;
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ assign(res,
+ binop(add,
+ binop(op,
+ mkexpr(reg_m),
+ mkU8(shift_imm)),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ binop(op,
+ mkexpr(reg_m),
+ mkU8(shift_imm - 1)),
+ imm_val)));
+ } else {
+ if (Q) {
+ res = newTemp(Ity_V128);
+ assign(res, getQReg(mreg));
+ } else {
+ res = newTemp(Ity_I64);
+ assign(res, getDRegI64(mreg));
+ }
+ }
+ if (A == 3) {
+ if (Q) {
+ putQReg(dreg, binop(add, mkexpr(res), getQReg(dreg)),
+ condT);
+ } else {
+ putDRegI64(dreg, binop(add, mkexpr(res), getDRegI64(dreg)),
+ condT);
+ }
+ DIP("vrsra.%c%u %c%u, %c%u, #%u\n",
+ U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, shift_imm);
+ } else {
+ if (Q) {
+ putQReg(dreg, mkexpr(res), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res), condT);
+ }
+ DIP("vrshr.%c%u %c%u, %c%u, #%u\n", U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, shift_imm);
+ }
+ return True;
+ case 1:
+ case 0:
+ /* VSHR, VSRA */
+ if (Q) {
+ reg_m = newTemp(Ity_V128);
+ assign(reg_m, getQReg(mreg));
+ res = newTemp(Ity_V128);
+ } else {
+ reg_m = newTemp(Ity_I64);
+ assign(reg_m, getDRegI64(mreg));
+ res = newTemp(Ity_I64);
+ }
+ if (Q) {
+ switch (size) {
+ case 0:
+ op = U ? Iop_ShrN8x16 : Iop_SarN8x16;
+ add = Iop_Add8x16;
+ break;
+ case 1:
+ op = U ? Iop_ShrN16x8 : Iop_SarN16x8;
+ add = Iop_Add16x8;
+ break;
+ case 2:
+ op = U ? Iop_ShrN32x4 : Iop_SarN32x4;
+ add = Iop_Add32x4;
+ break;
+ case 3:
+ op = U ? Iop_ShrN64x2 : Iop_SarN64x2;
+ add = Iop_Add64x2;
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op = U ? Iop_ShrN8x8 : Iop_SarN8x8;
+ add = Iop_Add8x8;
+ break;
+ case 1:
+ op = U ? Iop_ShrN16x4 : Iop_SarN16x4;
+ add = Iop_Add16x4;
+ break;
+ case 2:
+ op = U ? Iop_ShrN32x2 : Iop_SarN32x2;
+ add = Iop_Add32x2;
+ break;
+ case 3:
+ op = U ? Iop_Shr64 : Iop_Sar64;
+ add = Iop_Add64;
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ assign(res, binop(op, mkexpr(reg_m), mkU8(shift_imm)));
+ if (A == 1) {
+ if (Q) {
+ putQReg(dreg, binop(add, mkexpr(res), getQReg(dreg)),
+ condT);
+ } else {
+ putDRegI64(dreg, binop(add, mkexpr(res), getDRegI64(dreg)),
+ condT);
+ }
+ DIP("vsra.%c%u %c%u, %c%u, #%u\n", U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, shift_imm);
+ } else {
+ if (Q) {
+ putQReg(dreg, mkexpr(res), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res), condT);
+ }
+ DIP("vshr.%c%u %c%u, %c%u, #%u\n", U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, shift_imm);
+ }
+ return True;
+ case 4:
+ /* VSRI */
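+ /* Shift-right-insert: the top 'shift_imm' bits of each destination
+ lane are preserved, the rest is replaced by the shifted source.
+ 'mask' (all-ones >> shift_imm, per lane) selects the inserted
+ bits. */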
+ if (!U)
+ return False;
+ if (Q) {
+ res = newTemp(Ity_V128);
+ mask = newTemp(Ity_V128);
+ } else {
+ res = newTemp(Ity_I64);
+ mask = newTemp(Ity_I64);
+ }
+ switch (size) {
+ case 0: op = Q ? Iop_ShrN8x16 : Iop_ShrN8x8; break;
+ case 1: op = Q ? Iop_ShrN16x8 : Iop_ShrN16x4; break;
+ case 2: op = Q ? Iop_ShrN32x4 : Iop_ShrN32x2; break;
+ case 3: op = Q ? Iop_ShrN64x2 : Iop_Shr64; break;
+ default: vassert(0);
+ }
+ if (Q) {
+ assign(mask, binop(op, binop(Iop_64HLtoV128,
+ mkU64(0xFFFFFFFFFFFFFFFFLL),
+ mkU64(0xFFFFFFFFFFFFFFFFLL)),
+ mkU8(shift_imm)));
+ assign(res, binop(Iop_OrV128,
+ binop(Iop_AndV128,
+ getQReg(dreg),
+ unop(Iop_NotV128,
+ mkexpr(mask))),
+ binop(op,
+ getQReg(mreg),
+ mkU8(shift_imm))));
+ putQReg(dreg, mkexpr(res), condT);
+ } else {
+ assign(mask, binop(op, mkU64(0xFFFFFFFFFFFFFFFFLL),
+ mkU8(shift_imm)));
+ assign(res, binop(Iop_Or64,
+ binop(Iop_And64,
+ getDRegI64(dreg),
+ unop(Iop_Not64,
+ mkexpr(mask))),
+ binop(op,
+ getDRegI64(mreg),
+ mkU8(shift_imm))));
+ putDRegI64(dreg, mkexpr(res), condT);
+ }
+ DIP("vsri.%u %c%u, %c%u, #%u\n",
+ 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', mreg, shift_imm);
+ return True;
+ case 5:
+ if (U) {
+ /* VSLI */
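+ /* Shift-left-insert: shift_imm is first converted from the decoded
+ right-shift form into the left-shift amount.  The low 'shift_imm'
+ bits of each destination lane are preserved; the rest come from
+ the left-shifted source. */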
+ shift_imm = 8 * (1 << size) - shift_imm;
+ if (Q) {
+ res = newTemp(Ity_V128);
+ mask = newTemp(Ity_V128);
+ } else {
+ res = newTemp(Ity_I64);
+ mask = newTemp(Ity_I64);
+ }
+ switch (size) {
+ case 0: op = Q ? Iop_ShlN8x16 : Iop_ShlN8x8; break;
+ case 1: op = Q ? Iop_ShlN16x8 : Iop_ShlN16x4; break;
+ case 2: op = Q ? Iop_ShlN32x4 : Iop_ShlN32x2; break;
+ case 3: op = Q ? Iop_ShlN64x2 : Iop_Shl64; break;
+ default: vassert(0);
+ }
+ if (Q) {
+ assign(mask, binop(op, binop(Iop_64HLtoV128,
+ mkU64(0xFFFFFFFFFFFFFFFFLL),
+ mkU64(0xFFFFFFFFFFFFFFFFLL)),
+ mkU8(shift_imm)));
+ assign(res, binop(Iop_OrV128,
+ binop(Iop_AndV128,
+ getQReg(dreg),
+ unop(Iop_NotV128,
+ mkexpr(mask))),
+ binop(op,
+ getQReg(mreg),
+ mkU8(shift_imm))));
+ putQReg(dreg, mkexpr(res), condT);
+ } else {
+ assign(mask, binop(op, mkU64(0xFFFFFFFFFFFFFFFFLL),
+ mkU8(shift_imm)));
+ assign(res, binop(Iop_Or64,
+ binop(Iop_And64,
+ getDRegI64(dreg),
+ unop(Iop_Not64,
+ mkexpr(mask))),
+ binop(op,
+ getDRegI64(mreg),
+ mkU8(shift_imm))));
+ putDRegI64(dreg, mkexpr(res), condT);
+ }
+ DIP("vsli.%u %c%u, %c%u, #%u\n",
+ 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', mreg, shift_imm);
+ return True;
+ } else {
+ /* VSHL #imm */
+ shift_imm = 8 * (1 << size) - shift_imm;
+ if (Q) {
+ res = newTemp(Ity_V128);
+ } else {
+ res = newTemp(Ity_I64);
+ }
+ switch (size) {
+ case 0: op = Q ? Iop_ShlN8x16 : Iop_ShlN8x8; break;
+ case 1: op = Q ? Iop_ShlN16x8 : Iop_ShlN16x4; break;
+ case 2: op = Q ? Iop_ShlN32x4 : Iop_ShlN32x2; break;
+ case 3: op = Q ? Iop_ShlN64x2 : Iop_Shl64; break;
+ default: vassert(0);
+ }
+ assign(res, binop(op, Q ? getQReg(mreg) : getDRegI64(mreg),
+ mkU8(shift_imm)));
+ if (Q) {
+ putQReg(dreg, mkexpr(res), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res), condT);
+ }
+ DIP("vshl.i%u %c%u, %c%u, #%u\n",
+ 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', mreg, shift_imm);
+ return True;
+ }
+ break;
+ case 6:
+ case 7:
+ /* VQSHL, VQSHLU */
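+ /* shift_imm is converted from the decoded right-shift form into the
+ left-shift amount.  The QC check below shifts the saturated
+ result back right ('op_rev') and compares it with the original
+ operand: any mismatch means bits were clipped off the top. */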
+ shift_imm = 8 * (1 << size) - shift_imm;
+ if (U) {
+ if (A & 1) {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_QShlN8x16 : Iop_QShlN8x8;
+ op_rev = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ break;
+ case 1:
+ op = Q ? Iop_QShlN16x8 : Iop_QShlN16x4;
+ op_rev = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ break;
+ case 2:
+ op = Q ? Iop_QShlN32x4 : Iop_QShlN32x2;
+ op_rev = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ break;
+ case 3:
+ op = Q ? Iop_QShlN64x2 : Iop_QShlN64x1;
+ op_rev = Q ? Iop_ShrN64x2 : Iop_Shr64;
+ break;
+ default:
+ vassert(0);
+ }
+ DIP("vqshl.u%u %c%u, %c%u, #%u\n",
+ 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, shift_imm);
+ } else {
+ switch (size) {
+ case 0:
+ op = Q ? Iop_QShlN8Sx16 : Iop_QShlN8Sx8;
+ op_rev = Q ? Iop_ShrN8x16 : Iop_ShrN8x8;
+ break;
+ case 1:
+ op = Q ? Iop_QShlN16Sx8 : Iop_QShlN16Sx4;
+ op_rev = Q ? Iop_ShrN16x8 : Iop_ShrN16x4;
+ break;
+ case 2:
+ op = Q ? Iop_QShlN32Sx4 : Iop_QShlN32Sx2;
+ op_rev = Q ? Iop_ShrN32x4 : Iop_ShrN32x2;
+ break;
+ case 3:
+ op = Q ? Iop_QShlN64Sx2 : Iop_QShlN64Sx1;
+ op_rev = Q ? Iop_ShrN64x2 : Iop_Shr64;
+ break;
+ default:
+ vassert(0);
+ }
+ DIP("vqshlu.s%u %c%u, %c%u, #%u\n",
+ 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, shift_imm);
+ }
+ } else {
+ if (!(A & 1))
+ return False;
+ switch (size) {
+ case 0:
+ op = Q ? Iop_QSalN8x16 : Iop_QSalN8x8;
+ op_rev = Q ? Iop_SarN8x16 : Iop_SarN8x8;
+ break;
+ case 1:
+ op = Q ? Iop_QSalN16x8 : Iop_QSalN16x4;
+ op_rev = Q ? Iop_SarN16x8 : Iop_SarN16x4;
+ break;
+ case 2:
+ op = Q ? Iop_QSalN32x4 : Iop_QSalN32x2;
+ op_rev = Q ? Iop_SarN32x4 : Iop_SarN32x2;
+ break;
+ case 3:
+ op = Q ? Iop_QSalN64x2 : Iop_QSalN64x1;
+ op_rev = Q ? Iop_SarN64x2 : Iop_Sar64;
+ break;
+ default:
+ vassert(0);
+ }
+ DIP("vqshl.s%u %c%u, %c%u, #%u\n",
+ 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg, shift_imm);
+ }
+ if (Q) {
+ tmp = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ reg_m = newTemp(Ity_V128);
+ assign(reg_m, getQReg(mreg));
+ } else {
+ tmp = newTemp(Ity_I64);
+ res = newTemp(Ity_I64);
+ reg_m = newTemp(Ity_I64);
+ assign(reg_m, getDRegI64(mreg));
+ }
+ assign(res, binop(op, mkexpr(reg_m), mkU8(shift_imm)));
+#ifndef DISABLE_QC_FLAG
+ assign(tmp, binop(op_rev, mkexpr(res), mkU8(shift_imm)));
+ setFlag_QC(mkexpr(tmp), mkexpr(reg_m), Q, condT);
+#endif
+ if (Q)
+ putQReg(dreg, mkexpr(res), condT);
+ else
+ putDRegI64(dreg, mkexpr(res), condT);
+ return True;
+ case 8:
+ if (!U) {
+ if (L == 1)
+ return False;
+ size++;
+ dreg = ((theInstr >> 18) & 0x10) | ((theInstr >> 12) & 0xF);
+ mreg = ((theInstr >> 1) & 0x10) | (theInstr & 0xF);
+ if (mreg & 1)
+ return False;
+ mreg >>= 1;
+ if (!B) {
+ /* VSHRN */
+ IROp narOp;
+ reg_m = newTemp(Ity_V128);
+ assign(reg_m, getQReg(mreg));
+ res = newTemp(Ity_I64);
+ switch (size) {
+ case 1:
+ op = Iop_ShrN16x8;
+ narOp = Iop_Shorten16x8;
+ break;
+ case 2:
+ op = Iop_ShrN32x4;
+ narOp = Iop_Shorten32x4;
+ break;
+ case 3:
+ op = Iop_ShrN64x2;
+ narOp = Iop_Shorten64x2;
+ break;
+ default:
+ vassert(0);
+ }
+ assign(res, unop(narOp,
+ binop(op,
+ mkexpr(reg_m),
+ mkU8(shift_imm))));
+ putDRegI64(dreg, mkexpr(res), condT);
+ DIP("vshrn.i%u d%u, q%u, #%u\n", 8 << size, dreg, mreg,
+ shift_imm);
+ return True;
+ } else {
+ /* VRSHRN */
+ IROp addOp, shOp, narOp;
+ IRExpr *imm_val;
+ reg_m = newTemp(Ity_V128);
+ assign(reg_m, getQReg(mreg));
+ res = newTemp(Ity_I64);
+ imm = 1L;
+ switch (size) {
+ case 0: imm = (imm << 8) | imm; /* fall through */
+ case 1: imm = (imm << 16) | imm; /* fall through */
+ case 2: imm = (imm << 32) | imm; /* fall through */
+ case 3: break;
+ default: vassert(0);
+ }
+ imm_val = binop(Iop_64HLtoV128, mkU64(imm), mkU64(imm));
+ switch (size) {
+ case 1:
+ addOp = Iop_Add16x8;
+ shOp = Iop_ShrN16x8;
+ narOp = Iop_Shorten16x8;
+ break;
+ case 2:
+ addOp = Iop_Add32x4;
+ shOp = Iop_ShrN32x4;
+ narOp = Iop_Shorten32x4;
+ break;
+ case 3:
+ addOp = Iop_Add64x2;
+ shOp = Iop_ShrN64x2;
+ narOp = Iop_Shorten64x2;
+ break;
+ default:
+ vassert(0);
+ }
+ assign(res, unop(narOp,
+ binop(addOp,
+ binop(shOp,
+ mkexpr(reg_m),
+ mkU8(shift_imm)),
+ binop(Iop_AndV128,
+ binop(shOp,
+ mkexpr(reg_m),
+ mkU8(shift_imm - 1)),
+ imm_val))));
+ putDRegI64(dreg, mkexpr(res), condT);
+ if (shift_imm == 0) {
+ DIP("vmov%u d%u, q%u, #%u\n", 8 << size, dreg, mreg,
+ shift_imm);
+ } else {
+ DIP("vrshrn.i%u d%u, q%u, #%u\n", 8 << size, dreg, mreg,
+ shift_imm);
+ }
+ return True;
+ }
+ } else {
+ /* fall through */
+ }
+ case 9:
+ dreg = ((theInstr >> 18) & 0x10) | ((theInstr >> 12) & 0xF);
+ mreg = ((theInstr >> 1) & 0x10) | (theInstr & 0xF);
+ if (mreg & 1)
+ return False;
+ mreg >>= 1;
+ size++;
+ if ((theInstr >> 8) & 1) {
+ switch (size) {
+ case 1:
+ op = U ? Iop_ShrN16x8 : Iop_SarN16x8;
+ cvt = U ? Iop_QShortenU16Ux8 : Iop_QShortenS16Sx8;
+ cvt2 = U ? Iop_Longen8Ux8 : Iop_Longen8Sx8;
+ break;
+ case 2:
+ op = U ? Iop_ShrN32x4 : Iop_SarN32x4;
+ cvt = U ? Iop_QShortenU32Ux4 : Iop_QShortenS32Sx4;
+ cvt2 = U ? Iop_Longen16Ux4 : Iop_Longen16Sx4;
+ break;
+ case 3:
+ op = U ? Iop_ShrN64x2 : Iop_SarN64x2;
+ cvt = U ? Iop_QShortenU64Ux2 : Iop_QShortenS64Sx2;
+ cvt2 = U ? Iop_Longen32Ux2 : Iop_Longen32Sx2;
+ break;
+ default:
+ vassert(0);
+ }
+ DIP("vq%sshrn.%c%u d%u, q%u, #%u\n", B ? "r" : "",
+ U ? 'u' : 's', 8 << size, dreg, mreg, shift_imm);
+ } else {
+ vassert(U);
+ switch (size) {
+ case 1:
+ op = Iop_SarN16x8;
+ cvt = Iop_QShortenU16Sx8;
+ cvt2 = Iop_Longen8Ux8;
+ break;
+ case 2:
+ op = Iop_SarN32x4;
+ cvt = Iop_QShortenU32Sx4;
+ cvt2 = Iop_Longen16Ux4;
+ break;
+ case 3:
+ op = Iop_SarN64x2;
+ cvt = Iop_QShortenU64Sx2;
+ cvt2 = Iop_Longen32Ux2;
+ break;
+ default:
+ vassert(0);
+ }
+ DIP("vq%sshrun.s%u d%u, q%u, #%u\n", B ? "r" : "",
+ 8 << size, dreg, mreg, shift_imm);
+ }
+ if (B) {
+ if (shift_imm > 0) {
+ imm = 1;
+ switch (size) {
+ case 1: imm = (imm << 16) | imm; /* fall through */
+ case 2: imm = (imm << 32) | imm; /* fall through */
+ case 3: break;
+ case 0: default: vassert(0);
+ }
+ switch (size) {
+ case 1: add = Iop_Add16x8; break;
+ case 2: add = Iop_Add32x4; break;
+ case 3: add = Iop_Add64x2; break;
+ case 0: default: vassert(0);
+ }
+ }
+ }
+ reg_m = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ assign(reg_m, getQReg(mreg));
+ if (B) {
+ /* VQRSHRN, VQRSHRUN */
+ assign(res, binop(add,
+ binop(op, mkexpr(reg_m), mkU8(shift_imm)),
+ binop(Iop_AndV128,
+ binop(op,
+ mkexpr(reg_m),
+ mkU8(shift_imm - 1)),
+ mkU128(imm))));
+ } else {
+ /* VQSHRN, VQSHRUN */
+ assign(res, binop(op, mkexpr(reg_m), mkU8(shift_imm)));
+ }
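+ /* Saturation check: re-widen the narrowed result ('cvt2' after
+ 'cvt') and compare with the pre-narrowing value; lanes that fail
+ the round trip were clipped and set the sticky QC flag. */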
+#ifndef DISABLE_QC_FLAG
+ setFlag_QC(unop(cvt2, unop(cvt, mkexpr(res))), mkexpr(res),
+ True, condT);
+#endif
+ putDRegI64(dreg, unop(cvt, mkexpr(res)), condT);
+ return True;
+ case 10:
+ /* VSHLL
+ VMOVL ::= VSHLL #0 */
+ if (B)
+ return False;
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ shift_imm = (8 << size) - shift_imm;
+ res = newTemp(Ity_V128);
+ switch (size) {
+ case 0:
+ op = Iop_ShlN16x8;
+ cvt = U ? Iop_Longen8Ux8 : Iop_Longen8Sx8;
+ break;
+ case 1:
+ op = Iop_ShlN32x4;
+ cvt = U ? Iop_Longen16Ux4 : Iop_Longen16Sx4;
+ break;
+ case 2:
+ op = Iop_ShlN64x2;
+ cvt = U ? Iop_Longen32Ux2 : Iop_Longen32Sx2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(res, binop(op, unop(cvt, getDRegI64(mreg)), mkU8(shift_imm)));
+ putQReg(dreg, mkexpr(res), condT);
+ if (shift_imm == 0) {
+ DIP("vmovl.%c%u q%u, d%u\n", U ? 'u' : 's', 8 << size,
+ dreg, mreg);
+ } else {
+ DIP("vshll.%c%u q%u, d%u, #%u\n", U ? 'u' : 's', 8 << size,
+ dreg, mreg, shift_imm);
+ }
+ return True;
+ case 14:
+ case 15:
+ /* VCVT floating-point <-> fixed-point */
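+ /* The number of fraction bits is 64 - imm6.  Only 32-bit elements
+ are supported, and imm6<5> (bit 21) must be set, so imm6 lies in
+ 32..63 and #fbits in 1..32. */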
+ if ((theInstr >> 8) & 1) {
+ if (U) {
+ op = Q ? Iop_F32ToFixed32Ux4_RZ : Iop_F32ToFixed32Ux2_RZ;
+ } else {
+ op = Q ? Iop_F32ToFixed32Sx4_RZ : Iop_F32ToFixed32Sx2_RZ;
+ }
+ DIP("vcvt.%c32.f32 %c%u, %c%u, #%u\n", U ? 'u' : 's',
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg,
+ 64 - ((theInstr >> 16) & 0x3f));
+ } else {
+ if (U) {
+ op = Q ? Iop_Fixed32UToF32x4_RN : Iop_Fixed32UToF32x2_RN;
+ } else {
+ op = Q ? Iop_Fixed32SToF32x4_RN : Iop_Fixed32SToF32x2_RN;
+ }
+ DIP("vcvt.f32.%c32 %c%u, %c%u, #%u\n", U ? 'u' : 's',
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg,
+ 64 - ((theInstr >> 16) & 0x3f));
+ }
+ if (((theInstr >> 21) & 1) == 0)
+ return False;
+ if (Q) {
+ putQReg(dreg, binop(op, getQReg(mreg),
+ mkU8(64 - ((theInstr >> 16) & 0x3f))), condT);
+ } else {
+ putDRegI64(dreg, binop(op, getDRegI64(mreg),
+ mkU8(64 - ((theInstr >> 16) & 0x3f))), condT);
+ }
+ return True;
+ default:
+ return False;
+
+ }
+ return False;
+}
+
+/* A7.4.5 Two registers, miscellaneous */
+static
+Bool dis_neon_data_2reg_misc ( UInt theInstr, IRTemp condT )
+{
+ UInt A = (theInstr >> 16) & 3;
+ UInt B = (theInstr >> 6) & 0x1f;
+ UInt Q = (theInstr >> 6) & 1;
+ UInt U = (theInstr >> 24) & 1;
+ UInt size = (theInstr >> 18) & 3;
+ UInt dreg = get_neon_d_regno(theInstr);
+ UInt mreg = get_neon_m_regno(theInstr);
+ UInt F = (theInstr >> 10) & 1;
+ IRTemp arg_d;
+ IRTemp arg_m;
+ IRTemp res;
+ switch (A) {
+ case 0:
+ if (Q) {
+ arg_m = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ assign(arg_m, getQReg(mreg));
+ } else {
+ arg_m = newTemp(Ity_I64);
+ res = newTemp(Ity_I64);
+ assign(arg_m, getDRegI64(mreg));
+ }
+ switch (B >> 1) {
+ case 0: {
+ /* VREV64 */
+ IROp op;
+ switch (size) {
+ case 0:
+ op = Q ? Iop_Reverse64_8x16 : Iop_Reverse64_8x8;
+ break;
+ case 1:
+ op = Q ? Iop_Reverse64_16x8 : Iop_Reverse64_16x4;
+ break;
+ case 2:
+ op = Q ? Iop_Reverse64_32x4 : Iop_Reverse64_32x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(res, unop(op, mkexpr(arg_m)));
+ DIP("vrev64.%u %c%u, %c%u\n", 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 1: {
+ /* VREV32 */
+ IROp op;
+ switch (size) {
+ case 0:
+ op = Q ? Iop_Reverse32_8x16 : Iop_Reverse32_8x8;
+ break;
+ case 1:
+ op = Q ? Iop_Reverse32_16x8 : Iop_Reverse32_16x4;
+ break;
+ case 2:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(res, unop(op, mkexpr(arg_m)));
+ DIP("vrev32.%u %c%u, %c%u\n", 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 2: {
+ /* VREV16 */
+ IROp op;
+ switch (size) {
+ case 0:
+ op = Q ? Iop_Reverse16_8x16 : Iop_Reverse16_8x8;
+ break;
+ case 1:
+ case 2:
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(res, unop(op, mkexpr(arg_m)));
+ DIP("vrev16.%u %c%u, %c%u\n", 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 3:
+ return False;
+ case 4:
+ case 5: {
+ /* VPADDL */
+ IROp op;
+ U = (theInstr >> 7) & 1;
+ if (Q) {
+ switch (size) {
+ case 0: op = U ? Iop_PwAddL8Ux16 : Iop_PwAddL8Sx16; break;
+ case 1: op = U ? Iop_PwAddL16Ux8 : Iop_PwAddL16Sx8; break;
+ case 2: op = U ? Iop_PwAddL32Ux4 : Iop_PwAddL32Sx4; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0: op = U ? Iop_PwAddL8Ux8 : Iop_PwAddL8Sx8; break;
+ case 1: op = U ? Iop_PwAddL16Ux4 : Iop_PwAddL16Sx4; break;
+ case 2: op = U ? Iop_PwAddL32Ux2 : Iop_PwAddL32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ }
+ assign(res, unop(op, mkexpr(arg_m)));
+ DIP("vpaddl.%c%u %c%u, %c%u\n", U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 6:
+ case 7:
+ return False;
+ case 8: {
+ /* VCLS */
+ IROp op;
+ switch (size) {
+ case 0: op = Q ? Iop_Cls8Sx16 : Iop_Cls8Sx8; break;
+ case 1: op = Q ? Iop_Cls16Sx8 : Iop_Cls16Sx4; break;
+ case 2: op = Q ? Iop_Cls32Sx4 : Iop_Cls32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, unop(op, mkexpr(arg_m)));
+ DIP("vcls.s%u %c%u, %c%u\n", 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 9: {
+ /* VCLZ */
+ IROp op;
+ switch (size) {
+ case 0: op = Q ? Iop_Clz8Sx16 : Iop_Clz8Sx8; break;
+ case 1: op = Q ? Iop_Clz16Sx8 : Iop_Clz16Sx4; break;
+ case 2: op = Q ? Iop_Clz32Sx4 : Iop_Clz32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, unop(op, mkexpr(arg_m)));
+ DIP("vclz.i%u %c%u, %c%u\n", 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 10:
+ /* VCNT */
+ assign(res, unop(Q ? Iop_Cnt8x16 : Iop_Cnt8x8, mkexpr(arg_m)));
+ DIP("vcnt.8 %c%u, %c%u\n", Q ? 'q' : 'd', dreg, Q ? 'q' : 'd',
+ mreg);
+ break;
+ case 11:
+ /* VMVN */
+ if (Q)
+ assign(res, unop(Iop_NotV128, mkexpr(arg_m)));
+ else
+ assign(res, unop(Iop_Not64, mkexpr(arg_m)));
+ DIP("vmvn %c%u, %c%u\n", Q ? 'q' : 'd', dreg, Q ? 'q' : 'd',
+ mreg);
+ break;
+ case 12:
+ case 13: {
+ /* VPADAL */
+ IROp op, add_op;
+ U = (theInstr >> 7) & 1;
+ if (Q) {
+ switch (size) {
+ case 0:
+ op = U ? Iop_PwAddL8Ux16 : Iop_PwAddL8Sx16;
+ add_op = Iop_Add16x8;
+ break;
+ case 1:
+ op = U ? Iop_PwAddL16Ux8 : Iop_PwAddL16Sx8;
+ add_op = Iop_Add32x4;
+ break;
+ case 2:
+ op = U ? Iop_PwAddL32Ux4 : Iop_PwAddL32Sx4;
+ add_op = Iop_Add64x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op = U ? Iop_PwAddL8Ux8 : Iop_PwAddL8Sx8;
+ add_op = Iop_Add16x4;
+ break;
+ case 1:
+ op = U ? Iop_PwAddL16Ux4 : Iop_PwAddL16Sx4;
+ add_op = Iop_Add32x2;
+ break;
+ case 2:
+ op = U ? Iop_PwAddL32Ux2 : Iop_PwAddL32Sx2;
+ add_op = Iop_Add64;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ }
+ if (Q) {
+ arg_d = newTemp(Ity_V128);
+ assign(arg_d, getQReg(dreg));
+ } else {
+ arg_d = newTemp(Ity_I64);
+ assign(arg_d, getDRegI64(dreg));
+ }
+ assign(res, binop(add_op, unop(op, mkexpr(arg_m)),
+ mkexpr(arg_d)));
+ DIP("vpadal.%c%u %c%u, %c%u\n", U ? 'u' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 14: {
+ /* VQABS */
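+ /* Branch-free absolute value: mask = (m > 0), then
+ res = (mask & m) | (~mask & qsub(0, m)).
+ Saturating and wrapping negation differ only for the most
+ negative value, so comparing the two variants sets QC exactly
+ when a lane of m is INT_MIN. */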
+ IROp op_sub, op_qsub, op_cmp;
+ IRTemp mask, tmp;
+ IRExpr *zero1, *zero2;
+ IRExpr *neg, *neg2;
+ if (Q) {
+ zero1 = binop(Iop_64HLtoV128, mkU64(0), mkU64(0));
+ zero2 = binop(Iop_64HLtoV128, mkU64(0), mkU64(0));
+ mask = newTemp(Ity_V128);
+ tmp = newTemp(Ity_V128);
+ } else {
+ zero1 = mkU64(0);
+ zero2 = mkU64(0);
+ mask = newTemp(Ity_I64);
+ tmp = newTemp(Ity_I64);
+ }
+ switch (size) {
+ case 0:
+ op_sub = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ op_qsub = Q ? Iop_QSub8Sx16 : Iop_QSub8Sx8;
+ op_cmp = Q ? Iop_CmpGT8Sx16 : Iop_CmpGT8Sx8;
+ break;
+ case 1:
+ op_sub = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ op_qsub = Q ? Iop_QSub16Sx8 : Iop_QSub16Sx4;
+ op_cmp = Q ? Iop_CmpGT16Sx8 : Iop_CmpGT16Sx4;
+ break;
+ case 2:
+ op_sub = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ op_qsub = Q ? Iop_QSub32Sx4 : Iop_QSub32Sx2;
+ op_cmp = Q ? Iop_CmpGT32Sx4 : Iop_CmpGT32Sx2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(mask, binop(op_cmp, mkexpr(arg_m), zero1));
+ neg = binop(op_qsub, zero2, mkexpr(arg_m));
+ neg2 = binop(op_sub, zero2, mkexpr(arg_m));
+ assign(res, binop(Q ? Iop_OrV128 : Iop_Or64,
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(mask),
+ mkexpr(arg_m)),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ unop(Q ? Iop_NotV128 : Iop_Not64,
+ mkexpr(mask)),
+ neg)));
+#ifndef DISABLE_QC_FLAG
+ assign(tmp, binop(Q ? Iop_OrV128 : Iop_Or64,
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ mkexpr(mask),
+ mkexpr(arg_m)),
+ binop(Q ? Iop_AndV128 : Iop_And64,
+ unop(Q ? Iop_NotV128 : Iop_Not64,
+ mkexpr(mask)),
+ neg2)));
+ setFlag_QC(mkexpr(res), mkexpr(tmp), Q, condT);
+#endif
+ DIP("vqabs.s%u %c%u, %c%u\n", 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 15: {
+ /* VQNEG */
+ IROp op, op2;
+ IRExpr *zero;
+ if (Q) {
+ zero = binop(Iop_64HLtoV128, mkU64(0), mkU64(0));
+ } else {
+ zero = mkU64(0);
+ }
+ switch (size) {
+ case 0:
+ op = Q ? Iop_QSub8Sx16 : Iop_QSub8Sx8;
+ op2 = Q ? Iop_Sub8x16 : Iop_Sub8x8;
+ break;
+ case 1:
+ op = Q ? Iop_QSub16Sx8 : Iop_QSub16Sx4;
+ op2 = Q ? Iop_Sub16x8 : Iop_Sub16x4;
+ break;
+ case 2:
+ op = Q ? Iop_QSub32Sx4 : Iop_QSub32Sx2;
+ op2 = Q ? Iop_Sub32x4 : Iop_Sub32x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(res, binop(op, zero, mkexpr(arg_m)));
+#ifndef DISABLE_QC_FLAG
+ setFlag_QC(mkexpr(res), binop(op2, zero, mkexpr(arg_m)),
+ Q, condT);
+#endif
+ DIP("vqneg.s%u %c%u, %c%u\n", 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', mreg);
+ break;
+ }
+ default:
+ vassert(0);
+ }
+ if (Q) {
+ putQReg(dreg, mkexpr(res), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res), condT);
+ }
+ return True;
+ case 1:
+ if (Q) {
+ arg_m = newTemp(Ity_V128);
+ res = newTemp(Ity_V128);
+ assign(arg_m, getQReg(mreg));
+ } else {
+ arg_m = newTemp(Ity_I64);
+ res = newTemp(Ity_I64);
+ assign(arg_m, getDRegI64(mreg));
+ }
+ switch ((B >> 1) & 0x7) {
+ case 0: {
+ /* VCGT #0 */
+ IRExpr *zero;
+ IROp op;
+ if (Q) {
+ zero = binop(Iop_64HLtoV128, mkU64(0), mkU64(0));
+ } else {
+ zero = mkU64(0);
+ }
+ if (F) {
+ switch (size) {
+ case 0: case 1: case 3: return False;
+ case 2: op = Q ? Iop_CmpGT32Fx4 : Iop_CmpGT32Fx2; break;
+ default: vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0: op = Q ? Iop_CmpGT8Sx16 : Iop_CmpGT8Sx8; break;
+ case 1: op = Q ? Iop_CmpGT16Sx8 : Iop_CmpGT16Sx4; break;
+ case 2: op = Q ? Iop_CmpGT32Sx4 : Iop_CmpGT32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ }
+ assign(res, binop(op, mkexpr(arg_m), zero));
+ DIP("vcgt.%c%u %c%u, %c%u, #0\n", F ? 'f' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 1: {
+ /* VCGE #0 */
+ IROp op;
+ IRExpr *zero;
+ if (Q) {
+ zero = binop(Iop_64HLtoV128, mkU64(0), mkU64(0));
+ } else {
+ zero = mkU64(0);
+ }
+ if (F) {
+ switch (size) {
+ case 0: case 1: case 3: return False;
+ case 2: op = Q ? Iop_CmpGE32Fx4 : Iop_CmpGE32Fx2; break;
+ default: vassert(0);
+ }
+ assign(res, binop(op, mkexpr(arg_m), zero));
+ } else {
+ switch (size) {
+ case 0: op = Q ? Iop_CmpGT8Sx16 : Iop_CmpGT8Sx8; break;
+ case 1: op = Q ? Iop_CmpGT16Sx8 : Iop_CmpGT16Sx4; break;
+ case 2: op = Q ? Iop_CmpGT32Sx4 : Iop_CmpGT32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, unop(Q ? Iop_NotV128 : Iop_Not64,
+ binop(op, zero, mkexpr(arg_m))));
+ }
+ DIP("vcge.%c%u %c%u, %c%u, #0\n", F ? 'f' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 2: {
+ /* VCEQ #0 */
+ IROp op;
+ IRExpr *zero;
+ if (F) {
+ if (Q) {
+ zero = binop(Iop_64HLtoV128, mkU64(0), mkU64(0));
+ } else {
+ zero = mkU64(0);
+ }
+ switch (size) {
+ case 0: case 1: case 3: return False;
+ case 2: op = Q ? Iop_CmpEQ32Fx4 : Iop_CmpEQ32Fx2; break;
+ default: vassert(0);
+ }
+ assign(res, binop(op, zero, mkexpr(arg_m)));
+ } else {
+ switch (size) {
+ case 0: op = Q ? Iop_CmpNEZ8x16 : Iop_CmpNEZ8x8; break;
+ case 1: op = Q ? Iop_CmpNEZ16x8 : Iop_CmpNEZ16x4; break;
+ case 2: op = Q ? Iop_CmpNEZ32x4 : Iop_CmpNEZ32x2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, unop(Q ? Iop_NotV128 : Iop_Not64,
+ unop(op, mkexpr(arg_m))));
+ }
+ DIP("vceq.%c%u %c%u, %c%u, #0\n", F ? 'f' : 'i', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 3: {
+ /* VCLE #0 */
+ IRExpr *zero;
+ IROp op;
+ if (Q) {
+ zero = binop(Iop_64HLtoV128, mkU64(0), mkU64(0));
+ } else {
+ zero = mkU64(0);
+ }
+ if (F) {
+ switch (size) {
+ case 0: case 1: case 3: return False;
+ case 2: op = Q ? Iop_CmpGE32Fx4 : Iop_CmpGE32Fx2; break;
+ default: vassert(0);
+ }
+ assign(res, binop(op, zero, mkexpr(arg_m)));
+ } else {
+ switch (size) {
+ case 0: op = Q ? Iop_CmpGT8Sx16 : Iop_CmpGT8Sx8; break;
+ case 1: op = Q ? Iop_CmpGT16Sx8 : Iop_CmpGT16Sx4; break;
+ case 2: op = Q ? Iop_CmpGT32Sx4 : Iop_CmpGT32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, unop(Q ? Iop_NotV128 : Iop_Not64,
+ binop(op, mkexpr(arg_m), zero)));
+ }
+ DIP("vcle.%c%u %c%u, %c%u, #0\n", F ? 'f' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 4: {
+ /* VCLT #0 */
+ IROp op;
+ IRExpr *zero;
+ if (Q) {
+ zero = binop(Iop_64HLtoV128, mkU64(0), mkU64(0));
+ } else {
+ zero = mkU64(0);
+ }
+ if (F) {
+ switch (size) {
+ case 0: case 1: case 3: return False;
+ case 2: op = Q ? Iop_CmpGT32Fx4 : Iop_CmpGT32Fx2; break;
+ default: vassert(0);
+ }
+ assign(res, binop(op, zero, mkexpr(arg_m)));
+ } else {
+ switch (size) {
+ case 0: op = Q ? Iop_CmpGT8Sx16 : Iop_CmpGT8Sx8; break;
+ case 1: op = Q ? Iop_CmpGT16Sx8 : Iop_CmpGT16Sx4; break;
+ case 2: op = Q ? Iop_CmpGT32Sx4 : Iop_CmpGT32Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, binop(op, zero, mkexpr(arg_m)));
+ }
+ DIP("vclt.%c%u %c%u, %c%u, #0\n", F ? 'f' : 's', 8 << size,
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 5:
+ return False;
+ case 6: {
+ /* VABS */
+ if (!F) {
+ IROp op;
+ switch(size) {
+ case 0: op = Q ? Iop_Abs8x16 : Iop_Abs8x8; break;
+ case 1: op = Q ? Iop_Abs16x8 : Iop_Abs16x4; break;
+ case 2: op = Q ? Iop_Abs32x4 : Iop_Abs32x2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, unop(op, mkexpr(arg_m)));
+ } else {
+ assign(res, unop(Q ? Iop_Abs32Fx4 : Iop_Abs32Fx2,
+ mkexpr(arg_m)));
+ }
+ DIP("vabs.%c%u %c%u, %c%u\n",
+ F ? 'f' : 's', 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', mreg);
+ break;
+ }
+ case 7: {
+ /* VNEG */
+ IROp op;
+ IRExpr *zero;
+ if (F) {
+ switch (size) {
+ case 0: case 1: case 3: return False;
+ case 2: op = Q ? Iop_Neg32Fx4 : Iop_Neg32Fx2; break;
+ default: vassert(0);
+ }
+ assign(res, unop(op, mkexpr(arg_m)));
+ } else {
+ if (Q) {
+ zero = binop(Iop_64HLtoV128, mkU64(0), mkU64(0));
+ } else {
+ zero = mkU64(0);
+ }
+ switch (size) {
+ case 0: op = Q ? Iop_Sub8x16 : Iop_Sub8x8; break;
+ case 1: op = Q ? Iop_Sub16x8 : Iop_Sub16x4; break;
+ case 2: op = Q ? Iop_Sub32x4 : Iop_Sub32x2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, binop(op, zero, mkexpr(arg_m)));
+ }
+ DIP("vneg.%c%u %c%u, %c%u\n",
+ F ? 'f' : 's', 8 << size, Q ? 'q' : 'd', dreg,
+ Q ? 'q' : 'd', mreg);
+ break;
+ }
+ default:
+ vassert(0);
+ }
+ if (Q) {
+ putQReg(dreg, mkexpr(res), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res), condT);
+ }
+ return True;
+ case 2:
+ if ((B >> 1) == 0) {
+ /* VSWP */
+ if (Q) {
+ arg_m = newTemp(Ity_V128);
+ assign(arg_m, getQReg(mreg));
+ putQReg(mreg, getQReg(dreg), condT);
+ putQReg(dreg, mkexpr(arg_m), condT);
+ } else {
+ arg_m = newTemp(Ity_I64);
+ assign(arg_m, getDRegI64(mreg));
+ putDRegI64(mreg, getDRegI64(dreg), condT);
+ putDRegI64(dreg, mkexpr(arg_m), condT);
+ }
+ DIP("vswp %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ return True;
+ } else if ((B >> 1) == 1) {
+ /* VTRN */
+ IROp op_lo, op_hi;
+ IRTemp res1, res2;
+ if (Q) {
+ arg_m = newTemp(Ity_V128);
+ arg_d = newTemp(Ity_V128);
+ res1 = newTemp(Ity_V128);
+ res2 = newTemp(Ity_V128);
+ assign(arg_m, getQReg(mreg));
+ assign(arg_d, getQReg(dreg));
+ } else {
+ res1 = newTemp(Ity_I64);
+ res2 = newTemp(Ity_I64);
+ arg_m = newTemp(Ity_I64);
+ arg_d = newTemp(Ity_I64);
+ assign(arg_m, getDRegI64(mreg));
+ assign(arg_d, getDRegI64(dreg));
+ }
+ if (Q) {
+ switch (size) {
+ case 0:
+ op_lo = Iop_InterleaveOddLanes8x16;
+ op_hi = Iop_InterleaveEvenLanes8x16;
+ break;
+ case 1:
+ op_lo = Iop_InterleaveOddLanes16x8;
+ op_hi = Iop_InterleaveEvenLanes16x8;
+ break;
+ case 2:
+ op_lo = Iop_InterleaveOddLanes32x4;
+ op_hi = Iop_InterleaveEvenLanes32x4;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ } else {
+ switch (size) {
+ case 0:
+ op_lo = Iop_InterleaveOddLanes8x8;
+ op_hi = Iop_InterleaveEvenLanes8x8;
+ break;
+ case 1:
+ op_lo = Iop_InterleaveOddLanes16x4;
+ op_hi = Iop_InterleaveEvenLanes16x4;
+ break;
+ case 2:
+ op_lo = Iop_InterleaveLO32x2;
+ op_hi = Iop_InterleaveHI32x2;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ }
+ assign(res1, binop(op_lo, mkexpr(arg_m), mkexpr(arg_d)));
+ assign(res2, binop(op_hi, mkexpr(arg_m), mkexpr(arg_d)));
+ if (Q) {
+ putQReg(dreg, mkexpr(res1), condT);
+ putQReg(mreg, mkexpr(res2), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res1), condT);
+ putDRegI64(mreg, mkexpr(res2), condT);
+ }
+ DIP("vtrn.%u %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ return True;
+ } else if ((B >> 1) == 2) {
+ /* VUZP */
+ IROp op_lo, op_hi;
+ IRTemp res1, res2;
+ if (!Q && size == 2)
+ return False;
+ if (Q) {
+ arg_m = newTemp(Ity_V128);
+ arg_d = newTemp(Ity_V128);
+ res1 = newTemp(Ity_V128);
+ res2 = newTemp(Ity_V128);
+ assign(arg_m, getQReg(mreg));
+ assign(arg_d, getQReg(dreg));
+ } else {
+ res1 = newTemp(Ity_I64);
+ res2 = newTemp(Ity_I64);
+ arg_m = newTemp(Ity_I64);
+ arg_d = newTemp(Ity_I64);
+ assign(arg_m, getDRegI64(mreg));
+ assign(arg_d, getDRegI64(dreg));
+ }
+ switch (size) {
+ case 0:
+ op_lo = Q ? Iop_CatOddLanes8x16 : Iop_CatOddLanes8x8;
+ op_hi = Q ? Iop_CatEvenLanes8x16 : Iop_CatEvenLanes8x8;
+ break;
+ case 1:
+ op_lo = Q ? Iop_CatOddLanes16x8 : Iop_CatOddLanes16x4;
+ op_hi = Q ? Iop_CatEvenLanes16x8 : Iop_CatEvenLanes16x4;
+ break;
+ case 2:
+ op_lo = Iop_CatOddLanes32x4;
+ op_hi = Iop_CatEvenLanes32x4;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(res1, binop(op_lo, mkexpr(arg_m), mkexpr(arg_d)));
+ assign(res2, binop(op_hi, mkexpr(arg_m), mkexpr(arg_d)));
+ if (Q) {
+ putQReg(dreg, mkexpr(res1), condT);
+ putQReg(mreg, mkexpr(res2), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res1), condT);
+ putDRegI64(mreg, mkexpr(res2), condT);
+ }
+ DIP("vuzp.%u %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ return True;
+ } else if ((B >> 1) == 3) {
+ /* VZIP */
+ IROp op_lo, op_hi;
+ IRTemp res1, res2;
+ if (!Q && size == 2)
+ return False;
+ if (Q) {
+ arg_m = newTemp(Ity_V128);
+ arg_d = newTemp(Ity_V128);
+ res1 = newTemp(Ity_V128);
+ res2 = newTemp(Ity_V128);
+ assign(arg_m, getQReg(mreg));
+ assign(arg_d, getQReg(dreg));
+ } else {
+ res1 = newTemp(Ity_I64);
+ res2 = newTemp(Ity_I64);
+ arg_m = newTemp(Ity_I64);
+ arg_d = newTemp(Ity_I64);
+ assign(arg_m, getDRegI64(mreg));
+ assign(arg_d, getDRegI64(dreg));
+ }
+ switch (size) {
+ case 0:
+ op_lo = Q ? Iop_InterleaveHI8x16 : Iop_InterleaveHI8x8;
+ op_hi = Q ? Iop_InterleaveLO8x16 : Iop_InterleaveLO8x8;
+ break;
+ case 1:
+ op_lo = Q ? Iop_InterleaveHI16x8 : Iop_InterleaveHI16x4;
+ op_hi = Q ? Iop_InterleaveLO16x8 : Iop_InterleaveLO16x4;
+ break;
+ case 2:
+ op_lo = Iop_InterleaveHI32x4;
+ op_hi = Iop_InterleaveLO32x4;
+ break;
+ case 3:
+ return False;
+ default:
+ vassert(0);
+ }
+ assign(res1, binop(op_lo, mkexpr(arg_m), mkexpr(arg_d)));
+ assign(res2, binop(op_hi, mkexpr(arg_m), mkexpr(arg_d)));
+ if (Q) {
+ putQReg(dreg, mkexpr(res1), condT);
+ putQReg(mreg, mkexpr(res2), condT);
+ } else {
+ putDRegI64(dreg, mkexpr(res1), condT);
+ putDRegI64(mreg, mkexpr(res2), condT);
+ }
+ DIP("vzip.%u %c%u, %c%u\n",
+ 8 << size, Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ return True;
+ } else if (B == 8) {
+ /* VMOVN */
+ IROp op;
+ mreg >>= 1;
+ switch (size) {
+ case 0: op = Iop_Shorten16x8; break;
+ case 1: op = Iop_Shorten32x4; break;
+ case 2: op = Iop_Shorten64x2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ putDRegI64(dreg, unop(op, getQReg(mreg)), condT);
+ DIP("vmovn.i%u d%u, q%u\n", 16 << size, dreg, mreg);
+ return True;
+ } else if (B == 9 || (B >> 1) == 5) {
+ /* VQMOVN, VQMOVUN */
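+ /* Narrow with saturation ('op') and, in parallel, by plain
+ truncation ('op2'); any lane that saturated makes the two results
+ differ, which sets the sticky QC flag. */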
+ IROp op, op2;
+ IRTemp tmp;
+ dreg = ((theInstr >> 18) & 0x10) | ((theInstr >> 12) & 0xF);
+ mreg = ((theInstr >> 1) & 0x10) | (theInstr & 0xF);
+ if (mreg & 1)
+ return False;
+ mreg >>= 1;
+ switch (size) {
+ case 0: op2 = Iop_Shorten16x8; break;
+ case 1: op2 = Iop_Shorten32x4; break;
+ case 2: op2 = Iop_Shorten64x2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ switch (B & 3) {
+ case 0:
+ vassert(0);
+ case 1:
+ switch (size) {
+ case 0: op = Iop_QShortenU16Sx8; break;
+ case 1: op = Iop_QShortenU32Sx4; break;
+ case 2: op = Iop_QShortenU64Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ DIP("vqmovun.s%u d%u, q%u\n", 16 << size, dreg, mreg);
+ break;
+ case 2:
+ switch (size) {
+ case 0: op = Iop_QShortenS16Sx8; break;
+ case 1: op = Iop_QShortenS32Sx4; break;
+ case 2: op = Iop_QShortenS64Sx2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ DIP("vqmovn.s%u d%u, q%u\n", 16 << size, dreg, mreg);
+ break;
+ case 3:
+ switch (size) {
+ case 0: op = Iop_QShortenU16Ux8; break;
+ case 1: op = Iop_QShortenU32Ux4; break;
+ case 2: op = Iop_QShortenU64Ux2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ DIP("vqmovn.u%u d%u, q%u\n", 16 << size, dreg, mreg);
+ break;
+ default:
+ vassert(0);
+ }
+ res = newTemp(Ity_I64);
+ tmp = newTemp(Ity_I64);
+ assign(res, unop(op, getQReg(mreg)));
+#ifndef DISABLE_QC_FLAG
+ assign(tmp, unop(op2, getQReg(mreg)));
+ setFlag_QC(mkexpr(res), mkexpr(tmp), False, condT);
+#endif
+ putDRegI64(dreg, mkexpr(res), condT);
+ return True;
+ } else if (B == 12) {
+ /* VSHLL (maximum shift) */
+ IROp op, cvt;
+ UInt shift_imm;
+ if (Q)
+ return False;
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ shift_imm = 8 << size;
+ res = newTemp(Ity_V128);
+ switch (size) {
+ case 0: op = Iop_ShlN16x8; cvt = Iop_Longen8Ux8; break;
+ case 1: op = Iop_ShlN32x4; cvt = Iop_Longen16Ux4; break;
+ case 2: op = Iop_ShlN64x2; cvt = Iop_Longen32Ux2; break;
+ case 3: return False;
+ default: vassert(0);
+ }
+ assign(res, binop(op, unop(cvt, getDRegI64(mreg)),
+ mkU8(shift_imm)));
+ putQReg(dreg, mkexpr(res), condT);
+ DIP("vshll.i%u q%u, d%u, #%u\n", 8 << size, dreg, mreg, 8 << size);
+ return True;
+ } else if ((B >> 3) == 3 && (B & 3) == 0) {
+ /* VCVT (half<->single) */
+ /* Half-precision extensions are needed to run this */
+ vassert(0); // ATC
+ if (((theInstr >> 18) & 3) != 1)
+ return False;
+ if ((theInstr >> 8) & 1) {
+ if (dreg & 1)
+ return False;
+ dreg >>= 1;
+ putQReg(dreg, unop(Iop_F16toF32x4, getDRegI64(mreg)),
+ condT);
+ DIP("vcvt.f32.f16 q%u, d%u\n", dreg, mreg);
+ } else {
+ if (mreg & 1)
+ return False;
+ mreg >>= 1;
+ putDRegI64(dreg, unop(Iop_F32toF16x4, getQReg(mreg)),
+ condT);
+ DIP("vcvt.f16.f32 d%u, q%u\n", dreg, mreg);
+ }
+ return True;
+ } else {
+ return False;
+ }
+ vassert(0);
+ return True;
+ case 3:
+ if (((B >> 1) & BITS4(1,1,0,1)) == BITS4(1,0,0,0)) {
+ /* VRECPE */
+ IROp op;
+ F = (theInstr >> 8) & 1;
+ if (size != 2)
+ return False;
+ if (Q) {
+ op = F ? Iop_Recip32Fx4 : Iop_Recip32x4;
+ putQReg(dreg, unop(op, getQReg(mreg)), condT);
+ DIP("vrecpe.%c32 q%u, q%u\n", F ? 'f' : 'u', dreg, mreg);
+ } else {
+ op = F ? Iop_Recip32Fx2 : Iop_Recip32x2;
+ putDRegI64(dreg, unop(op, getDRegI64(mreg)), condT);
+ DIP("vrecpe.%c32 d%u, d%u\n", F ? 'f' : 'u', dreg, mreg);
+ }
+ return True;
+ } else if (((B >> 1) & BITS4(1,1,0,1)) == BITS4(1,0,0,1)) {
+ /* VRSQRTE */
+ IROp op;
+ F = (B >> 2) & 1;
+ if (size != 2)
+ return False;
+ if (F) {
+ /* fp */
+ op = Q ? Iop_Rsqrte32Fx4 : Iop_Rsqrte32Fx2;
+ } else {
+ /* unsigned int */
+ op = Q ? Iop_Rsqrte32x4 : Iop_Rsqrte32x2;
+ }
+ if (Q) {
+ putQReg(dreg, unop(op, getQReg(mreg)), condT);
+ DIP("vrsqrte.%c32 q%u, q%u\n", F ? 'f' : 'u', dreg, mreg);
+ } else {
+ putDRegI64(dreg, unop(op, getDRegI64(mreg)), condT);
+ DIP("vrsqrte.%c32 d%u, d%u\n", F ? 'f' : 'u', dreg, mreg);
+ }
+ return True;
+ } else if ((B >> 3) == 3) {
+ /* VCVT (fp<->integer) */
+ IROp op;
+ if (size != 2)
+ return False;
+ switch ((B >> 1) & 3) {
+ case 0:
+ op = Q ? Iop_I32StoFx4 : Iop_I32StoFx2;
+ DIP("vcvt.f32.s32 %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ case 1:
+ op = Q ? Iop_I32UtoFx4 : Iop_I32UtoFx2;
+ DIP("vcvt.f32.u32 %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ case 2:
+ op = Q ? Iop_FtoI32Sx4_RZ : Iop_FtoI32Sx2_RZ;
+ DIP("vcvt.s32.f32 %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ case 3:
+ op = Q ? Iop_FtoI32Ux4_RZ : Iop_FtoI32Ux2_RZ;
+ DIP("vcvt.u32.f32 %c%u, %c%u\n",
+ Q ? 'q' : 'd', dreg, Q ? 'q' : 'd', mreg);
+ break;
+ default:
+ vassert(0);
+ }
+ if (Q) {
+ putQReg(dreg, unop(op, getQReg(mreg)), condT);
+ } else {
+ putDRegI64(dreg, unop(op, getDRegI64(mreg)), condT);
+ }
+ return True;
+ } else {
+ return False;
+ }
+ vassert(0);
+ return True;
+ default:
+ vassert(0);
+ }
+ return False;
+}
+
+/* A7.4.6 One register and a modified immediate value */
+static
+void ppNeonImm(UInt imm, UInt cmode, UInt op)
+{
+ int i;
+ switch (cmode) {
+ case 0: case 1: case 8: case 9:
+ vex_printf("0x%x", imm);
+ break;
+ case 2: case 3: case 10: case 11:
+ vex_printf("0x%x00", imm);
+ break;
+ case 4: case 5:
+ vex_printf("0x%x0000", imm);
+ break;
+ case 6: case 7:
+ vex_printf("0x%x000000", imm);
+ break;
+ case 12:
+ vex_printf("0x%xff", imm);
+ break;
+ case 13:
+ vex_printf("0x%xffff", imm);
+ break;
+ case 14:
+ if (op) {
+ vex_printf("0x");
+ for (i = 7; i >= 0; i--)
+ vex_printf("%s", (imm & (1 << i)) ? "ff" : "00");
+ } else {
+ vex_printf("0x%x", imm);
+ }
+ break;
+ case 15:
+ vex_printf("0x%x", imm);
+ break;
+ }
+}
+
+static
+const char *ppNeonImmType(UInt cmode, UInt op)
+{
+ switch (cmode) {
+ case 0 ... 7:
+ case 12: case 13:
+ return "i32";
+ case 8 ... 11:
+ return "i16";
+ case 14:
+ if (op)
+ return "i64";
+ else
+ return "i8";
+ case 15:
+ if (op)
+ vassert(0);
+ else
+ return "f32";
+ default:
+ vassert(0);
+ }
+}
+
+static
+void DIPimm(UInt imm, UInt cmode, UInt op,
+ const char *instr, UInt Q, UInt dreg)
+{
+ if (vex_traceflags & VEX_TRACE_FE) {
+ vex_printf("%s.%s %c%u, #", instr,
+ ppNeonImmType(cmode, op), Q ? 'q' : 'd', dreg);
+ ppNeonImm(imm, cmode, op);
+ vex_printf("\n");
+ }
+}
+
+static
+Bool dis_neon_data_1reg_and_imm ( UInt theInstr, IRTemp condT )
+{
+ UInt dreg = get_neon_d_regno(theInstr);
+ ULong imm_raw = ((theInstr >> 17) & 0x80) | ((theInstr >> 12) & 0x70) |
+ (theInstr & 0xf);
+ ULong imm_raw_pp = imm_raw;
+ UInt cmode = (theInstr >> 8) & 0xf;
+ UInt op_bit = (theInstr >> 5) & 1;
+ ULong imm = 0;
+ UInt Q = (theInstr >> 6) & 1;
+ int i, j;
+ UInt tmp;
+ IRExpr *imm_val;
+ IRExpr *expr;
+ IRTemp tmp_var;
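+ /* Expand imm8 (gathered into imm_raw from three instruction fields)
+ into a replicated 64-bit constant, following the ARM
+ AdvSIMDExpandImm() pseudocode: cmode selects which byte/halfword
+ position imm8 occupies (cmodes 12/13 fill the bytes below it with
+ 0xff), cmode 14 with op=1 turns each imm8 bit into a 0x00 or 0xff
+ byte, and cmode 15 builds a 32-bit floating-point immediate,
+ replicated into both halves. */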
+ switch(cmode) {
+ case 7: case 6:
+ imm_raw = imm_raw << 8;
+ /* fallthrough */
+ case 5: case 4:
+ imm_raw = imm_raw << 8;
+ /* fallthrough */
+ case 3: case 2:
+ imm_raw = imm_raw << 8;
+ /* fallthrough */
+ case 0: case 1:
+ imm = (imm_raw << 32) | imm_raw;
+ break;
+ case 11: case 10:
+ imm_raw = imm_raw << 8;
+ /* fallthrough */
+ case 9: case 8:
+ imm_raw = (imm_raw << 16) | imm_raw;
+ imm = (imm_raw << 32) | imm_raw;
+ break;
+ case 13:
+ imm_raw = (imm_raw << 8) | 0xff;
+ /* fallthrough */
+ case 12:
+ imm_raw = (imm_raw << 8) | 0xff;
+ imm = (imm_raw << 32) | imm_raw;
+ break;
+ case 14:
+ if (! op_bit) {
+ for(i = 0; i < 8; i++) {
+ imm = (imm << 8) | imm_raw;
+ }
+ } else {
+ for(i = 7; i >= 0; i--) {
+ tmp = 0;
+ for(j = 0; j < 8; j++) {
+ tmp = (tmp << 1) | ((imm_raw >> i) & 1);
+ }
+ imm = (imm << 8) | tmp;
+ }
+ }
+ break;
+ case 15:
+ imm = (imm_raw & 0x80) << 5;
+ imm |= ((~imm_raw & 0x40) << 5);  /* bit 11 = NOT(b) */
+ for(i = 1; i <= 4; i++)
+ imm |= (imm_raw & 0x40) << i;
+ imm |= (imm_raw & 0x7f);
+ imm = imm << 19;
+ imm = (imm << 32) | imm;
+ break;
+ default:
+ return False;
+ }
+ if (Q) {
+ imm_val = binop(Iop_64HLtoV128, mkU64(imm), mkU64(imm));
+ } else {
+ imm_val = mkU64(imm);
+ }
+ if (((op_bit == 0) &&
+ (((cmode & 9) == 0) || ((cmode & 13) == 8) || ((cmode & 12) == 12))) ||
+ ((op_bit == 1) && (cmode == 14))) {
+ /* VMOV (immediate) */
+ if (Q) {
+ putQReg(dreg, imm_val, condT);
+ } else {
+ putDRegI64(dreg, imm_val, condT);
+ }
+ DIPimm(imm_raw_pp, cmode, op_bit, "vmov", Q, dreg);
+ return True;
+ }
+ if ((op_bit == 1) &&
+ (((cmode & 9) == 0) || ((cmode & 13) == 8) || ((cmode & 14) == 12))) {
+ /* VMVN (immediate) */
+ if (Q) {
+ putQReg(dreg, unop(Iop_NotV128, imm_val), condT);
+ } else {
+ putDRegI64(dreg, unop(Iop_Not64, imm_val), condT);
+ }
+ DIPimm(imm_raw_pp, cmode, op_bit, "vmvn", Q, dreg);
+ return True;
+ }
+ if (Q) {
+ tmp_var = newTemp(Ity_V128);
+ assign(tmp_var, getQReg(dreg));
+ } else {
+ tmp_var = newTemp(Ity_I64);
+ assign(tmp_var, getDRegI64(dreg));
+ }
+ if ((op_bit == 0) && (((cmode & 9) == 1) || ((cmode & 13) == 9))) {
+ /* VORR (immediate) */
+ if (Q)
+ expr = binop(Iop_OrV128, mkexpr(tmp_var), imm_val);
+ else
+ expr = binop(Iop_Or64, mkexpr(tmp_var), imm_val);
+ DIPimm(imm_raw_pp, cmode, op_bit, "vorr", Q, dreg);
+ } else if ((op_bit == 1) && (((cmode & 9) == 1) || ((cmode & 13) == 9))) {
+ /* VBIC (immediate) */
+ if (Q)
+ expr = binop(Iop_AndV128, mkexpr(tmp_var),
+ unop(Iop_NotV128, imm_val));
+ else
+ expr = binop(Iop_And64, mkexpr(tmp_var), unop(Iop_Not64, imm_val));
+ DIPimm(imm_raw_pp, cmode, op_bit, "vbic", Q, dreg);
+ } else {
+ return False;
+ }
+ if (Q)
+ putQReg(dreg, expr, condT);
+ else
+ putDRegI64(dreg, expr, condT);
+ return True;
+}
+
+/* A7.4 Advanced SIMD data-processing instructions */
+static
+Bool dis_neon_data_processing ( UInt theInstr, IRTemp condT )
+{
+ UInt A = (theInstr >> 19) & 0x1F;
+ UInt B = (theInstr >> 8) & 0xF;
+ UInt C = (theInstr >> 4) & 0xF;
+ UInt U = (theInstr >> 24) & 0x1;
+
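+ /* Dispatch on the A (bits 23:19), B (bits 11:8), C (bits 7:4) and
+ U (bit 24) fields, following the A7.4 decode table. */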
+ if (! (A & 0x10)) {
+ return dis_neon_data_3same(theInstr, condT);
+ }
+ if (((A & 0x17) == 0x10) && ((C & 0x9) == 0x1)) {
+ return dis_neon_data_1reg_and_imm(theInstr, condT);
+ }
+ if ((C & 1) == 1) {
+ return dis_neon_data_2reg_and_shift(theInstr, condT);
+ }
+ if (((C & 5) == 0) && (((A & 0x14) == 0x10) || ((A & 0x16) == 0x14))) {
+ return dis_neon_data_3diff(theInstr, condT);
+ }
+ if (((C & 5) == 4) && (((A & 0x14) == 0x10) || ((A & 0x16) == 0x14))) {
+ return dis_neon_data_2reg_and_scalar(theInstr, condT);
+ }
+ if ((A & 0x16) == 0x16) {
+ if ((U == 0) && ((C & 1) == 0)) {
+ return dis_neon_vext(theInstr, condT);
+ }
+ if ((U != 1) || ((C & 1) == 1))
+ return False;
+ if ((B & 8) == 0) {
+ return dis_neon_data_2reg_misc(theInstr, condT);
+ }
+ if ((B & 12) == 8) {
+ return dis_neon_vtb(theInstr, condT);
+ }
+ if ((B == 12) && ((C & 9) == 0)) {
+ return dis_neon_vdup(theInstr, condT);
+ }
+ }
+ return False;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- NEON loads and stores ---*/
+/*------------------------------------------------------------*/
+
+/* For NEON memory operations, we use the standard scheme to handle
+ conditionalisation: generate a jump around the instruction if the
+ condition is false. That's only necessary in Thumb mode, however,
+ since in ARM mode NEON instructions are unconditional. */
+
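+/* Concretely (a sketch): for a conditional Thumb NEON access the
+ generated IR has the shape
+
+ if (!<cond>) goto <next insn>;  // mk_skip_over_T32_if_cond_is_false
+ ... unconditional IR for the memory operation ...
+
+ so the IR for the operation itself never needs per-statement
+ guards. */
+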
+/* A helper function for what follows.  It loads one (8 << size)-bit
+ element from memory into lane 'index' of each of the N+1 registers
+ rD, rD+inc, ..., with register rD+i*inc reading from 'addr' plus i
+ element-sizes.  It assumes we already went uncond as per comments
+ at the top of this section. */
+static
+void mk_neon_elem_load_to_one_lane( UInt rD, UInt inc, UInt index,
+ UInt N, UInt size, IRTemp addr )
+{
+ UInt i;
+ switch (size) {
+ case 0:
+ putDRegI64(rD, triop(Iop_SetElem8x8, getDRegI64(rD), mkU8(index),
+ loadLE(Ity_I8, mkexpr(addr))), IRTemp_INVALID);
+ break;
+ case 1:
+ putDRegI64(rD, triop(Iop_SetElem16x4, getDRegI64(rD), mkU8(index),
+ loadLE(Ity_I16, mkexpr(addr))), IRTemp_INVALID);
+ break;
+ case 2:
+ putDRegI64(rD, triop(Iop_SetElem32x2, getDRegI64(rD), mkU8(index),
+ loadLE(Ity_I32, mkexpr(addr))), IRTemp_INVALID);
+ break;
+ default:
+ vassert(0);
+ }
+ for (i = 1; i <= N; i++) {
+ switch (size) {
+ case 0:
+ putDRegI64(rD + i * inc,
+ triop(Iop_SetElem8x8,
+ getDRegI64(rD + i * inc),
+ mkU8(index),
+ loadLE(Ity_I8, binop(Iop_Add32,
+ mkexpr(addr),
+ mkU32(i * 1)))),
+ IRTemp_INVALID);
+ break;
+ case 1:
+ putDRegI64(rD + i * inc,
+ triop(Iop_SetElem16x4,
+ getDRegI64(rD + i * inc),
+ mkU8(index),
+ loadLE(Ity_I16, binop(Iop_Add32,
+ mkexpr(addr),
+ mkU32(i * 2)))),
+ IRTemp_INVALID);
+ break;
+ case 2:
+ putDRegI64(rD + i * inc,
+ triop(Iop_SetElem32x2,
+ getDRegI64(rD + i * inc),
+ mkU8(index),
+ loadLE(Ity_I32, binop(Iop_Add32,
+ mkexpr(addr),
+ mkU32(i * 4)))),
+ IRTemp_INVALID);
+ break;
+ default:
+ vassert(0);
+ }
+ }
+}
+
+/* A(nother) helper function for what follows. It assumes we already
+ went uncond as per comments at the top of this section. */
+static
+void mk_neon_elem_store_from_one_lane( UInt rD, UInt inc, UInt index,
+ UInt N, UInt size, IRTemp addr )
+{
+ UInt i;
+ switch (size) {
+ case 0:
+ storeLE(mkexpr(addr),
+ binop(Iop_GetElem8x8, getDRegI64(rD), mkU8(index)));
+ break;
+ case 1:
+ storeLE(mkexpr(addr),
+ binop(Iop_GetElem16x4, getDRegI64(rD), mkU8(index)));
+ break;
+ case 2:
+ storeLE(mkexpr(addr),
+ binop(Iop_GetElem32x2, getDRegI64(rD), mkU8(index)));
+ break;
+ default:
+ vassert(0);
+ }
+ for (i = 1; i <= N; i++) {
+ switch (size) {
+ case 0:
+ storeLE(binop(Iop_Add32, mkexpr(addr), mkU32(i * 1)),
+ binop(Iop_GetElem8x8, getDRegI64(rD + i * inc),
+ mkU8(index)));
+ break;
+ case 1:
+ storeLE(binop(Iop_Add32, mkexpr(addr), mkU32(i * 2)),
+ binop(Iop_GetElem16x4, getDRegI64(rD + i * inc),
+ mkU8(index)));
+ break;
+ case 2:
+ storeLE(binop(Iop_Add32, mkexpr(addr), mkU32(i * 4)),
+ binop(Iop_GetElem32x2, getDRegI64(rD + i * inc),
+ mkU8(index)));
+ break;
+ default:
+ vassert(0);
+ }
+ }
+}
+
+/* A7.7 Advanced SIMD element or structure load/store instructions */
+static
+Bool dis_neon_elem_or_struct_load ( UInt theInstr,
+ Bool isT, IRTemp condT )
+{
+# define INSN(_bMax,_bMin) SLICE_UInt(theInstr, (_bMax), (_bMin))
+ UInt A = INSN(23,23);
+ UInt B = INSN(11,8);
+ UInt L = INSN(21,21);
+ UInt rD = (INSN(22,22) << 4) | INSN(15,12);
+ UInt rN = INSN(19,16);
+ UInt rM = INSN(3,0);
+ UInt N, size, i, j;
+ UInt inc;
+ UInt regs = 1;
+
+ if (isT) {
+ vassert(condT != IRTemp_INVALID);
+ } else {
+ vassert(condT == IRTemp_INVALID);
+ }
+ /* So now, if condT is not IRTemp_INVALID, we know we're
+ dealing with Thumb code. */
+
+ if (INSN(20,20) != 0)
+ return False;
+
+ IRTemp initialRn = newTemp(Ity_I32);
+ assign(initialRn, isT ? getIRegT(rN) : getIRegA(rN));
+
+ IRTemp initialRm = newTemp(Ity_I32);
+ assign(initialRm, isT ? getIRegT(rM) : getIRegA(rM));
+
+ if (A) {
+ N = B & 3;
+ if ((B >> 2) < 3) {
+ /* VSTn / VLDn (n-element structure from/to one lane) */
+
+ size = B >> 2;
+
+ switch (size) {
+ case 0: i = INSN(7,5); inc = 1; break;
+ case 1: i = INSN(7,6); inc = INSN(5,5) ? 2 : 1; break;
+ case 2: i = INSN(7,7); inc = INSN(6,6) ? 2 : 1; break;
+ case 3: return False;
+ default: vassert(0);
+ }
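+         /* Example: with size == 1 (16-bit elements) the lane index
+            comes from bits [7:6] and bit [5] selects a register
+            stride of 2, as in VLD3.16 {d0[1],d2[1],d4[1]}, [r0]. */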
+
+ IRTemp addr = newTemp(Ity_I32);
+ assign(addr, mkexpr(initialRn));
+
+ // go uncond
+ if (condT != IRTemp_INVALID)
+ mk_skip_over_T32_if_cond_is_false(condT);
+ // now uncond
+
+ if (L)
+ mk_neon_elem_load_to_one_lane(rD, inc, i, N, size, addr);
+ else
+ mk_neon_elem_store_from_one_lane(rD, inc, i, N, size, addr);
+ DIP("v%s%u.%u {", L ? "ld" : "st", N + 1, 8 << size);
+ for (j = 0; j <= N; j++) {
+ if (j)
+ DIP(", ");
+ DIP("d%u[%u]", rD + j * inc, i);
+ }
+ DIP("}, [r%u]", rN);
+ if (rM != 13 && rM != 15) {
+ DIP(", r%u\n", rM);
+ } else {
+ DIP("%s\n", (rM != 15) ? "!" : "");
+ }
+ } else {
+ /* VLDn (single element to all lanes) */
+ UInt r;
+ if (L == 0)
+ return False;
+
+ inc = INSN(5,5) + 1;
+ size = INSN(7,6);
+
+ /* size == 3 and size == 2 cases differ in alignment constraints */
+ if (size == 3 && N == 3 && INSN(4,4) == 1)
+ size = 2;
+
+ if (size == 0 && N == 0 && INSN(4,4) == 1)
+ return False;
+ if (N == 2 && INSN(4,4) == 1)
+ return False;
+ if (size == 3)
+ return False;
+
+ // go uncond
+ if (condT != IRTemp_INVALID)
+ mk_skip_over_T32_if_cond_is_false(condT);
+ // now uncond
+
+ IRTemp addr = newTemp(Ity_I32);
+ assign(addr, mkexpr(initialRn));
+
+ if (N == 0 && INSN(5,5))
+ regs = 2;
+
+ for (r = 0; r < regs; r++) {
+ switch (size) {
+ case 0:
+ putDRegI64(rD + r, unop(Iop_Dup8x8,
+ loadLE(Ity_I8, mkexpr(addr))),
+ IRTemp_INVALID);
+ break;
+ case 1:
+ putDRegI64(rD + r, unop(Iop_Dup16x4,
+ loadLE(Ity_I16, mkexpr(addr))),
+ IRTemp_INVALID);
+ break;
+ case 2:
+ putDRegI64(rD + r, unop(Iop_Dup32x2,
+ loadLE(Ity_I32, mkexpr(addr))),
+ IRTemp_INVALID);
+ break;
+ default:
+ vassert(0);
+ }
+ for (i = 1; i <= N; i++) {
+ switch (size) {
+ case 0:
+ putDRegI64(rD + r + i * inc,
+ unop(Iop_Dup8x8,
+ loadLE(Ity_I8, binop(Iop_Add32,
+ mkexpr(addr),
+ mkU32(i * 1)))),
+ IRTemp_INVALID);
+ break;
+ case 1:
+ putDRegI64(rD + r + i * inc,
+ unop(Iop_Dup16x4,
+ loadLE(Ity_I16, binop(Iop_Add32,
+ mkexpr(addr),
+ mkU32(i * 2)))),
+ IRTemp_INVALID);
+ break;
+ case 2:
+ putDRegI64(rD + r + i * inc,
+ unop(Iop_Dup32x2,
+ loadLE(Ity_I32, binop(Iop_Add32,
+ mkexpr(addr),
+ mkU32(i * 4)))),
+ IRTemp_INVALID);
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ }
+ DIP("vld%u.%u {", N + 1, 8 << size);
+ for (r = 0; r < regs; r++) {
+ for (i = 0; i <= N; i++) {
+ if (i || r)
+ DIP(", ");
+ DIP("d%u[]", rD + r + i * inc);
+ }
+ }
+ DIP("}, [r%u]", rN);
+ if (rM != 13 && rM != 15) {
+ DIP(", r%u\n", rM);
+ } else {
+ DIP("%s\n", (rM != 15) ? "!" : "");
+ }
+ }
+ /* Writeback. We're uncond here, so no condT-ing. */
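+   /* Example: VLD2.16 {d0[1],d1[1]}, [r2]! encodes rM == 13, so r2
+      is advanced by (1 << size) * (N + 1) = 2 * 2 = 4, the number of
+      bytes transferred. */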
+ if (rM != 15) {
+ if (rM == 13) {
+ IRExpr* e = binop(Iop_Add32,
+ mkexpr(initialRn),
+ mkU32((1 << size) * (N + 1)));
+ if (isT)
+ putIRegT(rN, e, IRTemp_INVALID);
+ else
+ putIRegA(rN, e, IRTemp_INVALID, Ijk_Boring);
+ } else {
+ IRExpr* e = binop(Iop_Add32,
+ mkexpr(initialRn),
+ mkexpr(initialRm));
+ if (isT)
+ putIRegT(rN, e, IRTemp_INVALID);
+ else
+ putIRegA(rN, e, IRTemp_INVALID, Ijk_Boring);
+ }
+ }
+ return True;
+ } else {
+ IRTemp tmp;
+ UInt r, elems;
+ /* VSTn / VLDn (multiple n-element structures) */
+ if (B == BITS4(0,0,1,0) || B == BITS4(0,1,1,0)
+ || B == BITS4(0,1,1,1) || B == BITS4(1,0,1,0)) {
+ N = 0;
+ } else if (B == BITS4(0,0,1,1) || B == BITS4(1,0,0,0)
+ || B == BITS4(1,0,0,1)) {
+ N = 1;
+ } else if (B == BITS4(0,1,0,0) || B == BITS4(0,1,0,1)) {
+ N = 2;
+ } else if (B == BITS4(0,0,0,0) || B == BITS4(0,0,0,1)) {
+ N = 3;
+ } else {
+ return False;
+ }
+ inc = (B & 1) + 1;
+ if (N == 1 && B == BITS4(0,0,1,1)) {
+ regs = 2;
+ } else if (N == 0) {
+ if (B == BITS4(1,0,1,0)) {
+ regs = 2;
+ } else if (B == BITS4(0,1,1,0)) {
+ regs = 3;
+ } else if (B == BITS4(0,0,1,0)) {
+ regs = 4;
+ }
+ }
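+      /* Example: B == 0110 is a 1-element structure (N == 0) with
+         inc == (B & 1) + 1 == 1 and regs == 3, i.e.
+         vld1.32 {d0,d1,d2}, [r1]. */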
+
+ size = INSN(7,6);
+ if (N == 0 && size == 3)
+ size = 2;
+ if (size == 3)
+ return False;
+
+ elems = 8 / (1 << size);
+
+ // go uncond
+ if (condT != IRTemp_INVALID)
+ mk_skip_over_T32_if_cond_is_false(condT);
+ // now uncond
+
+ IRTemp addr = newTemp(Ity_I32);
+ assign(addr, mkexpr(initialRn));
+
+ for (r = 0; r < regs; r++) {
+ for (i = 0; i < elems; i++) {
+ if (L)
+ mk_neon_elem_load_to_one_lane(rD + r, inc, i, N, size, addr);
+ else
+ mk_neon_elem_store_from_one_lane(rD + r, inc, i, N, size, addr);
+ tmp = newTemp(Ity_I32);
+ assign(tmp, binop(Iop_Add32, mkexpr(addr),
+ mkU32((1 << size) * (N + 1))));
+ addr = tmp;
+ }
+ }
+ /* Writeback */
+ if (rM != 15) {
+ if (rM == 13) {
+ IRExpr* e = binop(Iop_Add32,
+ mkexpr(initialRn),
+ mkU32(8 * (N + 1) * regs));
+ if (isT)
+ putIRegT(rN, e, IRTemp_INVALID);
+ else
+ putIRegA(rN, e, IRTemp_INVALID, Ijk_Boring);
+ } else {
+ IRExpr* e = binop(Iop_Add32,
+ mkexpr(initialRn),
+ mkexpr(initialRm));
+ if (isT)
+ putIRegT(rN, e, IRTemp_INVALID);
+ else
+ putIRegA(rN, e, IRTemp_INVALID, Ijk_Boring);
+ }
+ }
+ DIP("v%s%u.%u {", L ? "ld" : "st", N + 1, 8 << INSN(7,6));
+ if ((inc == 1 && regs * (N + 1) > 1)
+ || (inc == 2 && regs > 1 && N > 0)) {
+ DIP("d%u-d%u", rD, rD + regs * (N + 1) - 1);
+ } else {
+ for (r = 0; r < regs; r++) {
+ for (i = 0; i <= N; i++) {
+ if (i || r)
+ DIP(", ");
+ DIP("d%u", rD + r + i * inc);
+ }
+ }
+ }
+ DIP("}, [r%u]", rN);
+ if (rM != 13 && rM != 15) {
+ DIP(", r%u\n", rM);
+ } else {
+ DIP("%s\n", (rM != 15) ? "!" : "");
+ }
+ return True;
+ }
+# undef INSN
+}
+
+
+/*------------------------------------------------------------*/
+/*--- NEON, top level control ---*/
+/*------------------------------------------------------------*/
+
+/* Both ARM and Thumb */
+
+/* Translate a NEON instruction. On success, returns True; *dres may
+   or may not be updated. On failure, returns False without changing
+   *dres or creating any IR.
+
+   The Thumb and ARM encodings are similar for the bottom 24 bits, but
+   the top 8 bits differ slightly. In both cases, the caller must
+   pass the entire 32 bits. Callers may pass any instruction; this
+   function ignores non-NEON ones.
+
+ Caller must supply an IRTemp 'condT' holding the gating condition,
+ or IRTemp_INVALID indicating the insn is always executed. In ARM
+ code, this must always be IRTemp_INVALID because NEON insns are
+ unconditional for ARM.
+
+ Finally, the caller must indicate whether this occurs in ARM or in
+ Thumb code.
+*/
+static Bool decode_NEON_instruction (
+ /*MOD*/DisResult* dres,
+ UInt insn32,
+ IRTemp condT,
+ Bool isT
+ )
+{
+# define INSN(_bMax,_bMin) SLICE_UInt(insn32, (_bMax), (_bMin))
+
+ /* There are two kinds of instruction to deal with: load/store and
+ data processing. In each case, in ARM mode we merely identify
+ the kind, and pass it on to the relevant sub-handler. In Thumb
+ mode we identify the kind, swizzle the bits around to make it
+ have the same encoding as in ARM, and hand it on to the
+ sub-handler.
+ */
+
+ /* In ARM mode, NEON instructions can't be conditional. */
+ if (!isT)
+ vassert(condT == IRTemp_INVALID);
+
+ /* Data processing:
+ Thumb: 111U 1111 AAAA Axxx xxxx BBBB CCCC xxxx
+ ARM: 1111 001U AAAA Axxx xxxx BBBB CCCC xxxx
+ */
+ if (!isT && INSN(31,25) == BITS7(1,1,1,1,0,0,1)) {
+ // ARM, DP
+ return dis_neon_data_processing(INSN(31,0), condT);
+ }
+ if (isT && INSN(31,29) == BITS3(1,1,1)
+ && INSN(27,24) == BITS4(1,1,1,1)) {
+ // Thumb, DP
+ UInt reformatted = INSN(23,0);
+ reformatted |= (INSN(28,28) << 24); // U bit
+ reformatted |= (BITS7(1,1,1,1,0,0,1) << 25);
+ return dis_neon_data_processing(reformatted, condT);
+ }
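+   /* Example of the swizzle: the Thumb opcode 0xFF123456 has U == 1
+      (bit 28) and so is rewritten as 0xF3123456, which carries the
+      same A, B and C fields in the ARM encoding shown above. */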
+
+ /* Load/store:
+ Thumb: 1111 1001 AxL0 xxxx xxxx BBBB xxxx xxxx
+ ARM: 1111 0100 AxL0 xxxx xxxx BBBB xxxx xxxx
+ */
+ if (!isT && INSN(31,24) == BITS8(1,1,1,1,0,1,0,0)) {
+ // ARM, memory
+ return dis_neon_elem_or_struct_load(INSN(31,0), isT, condT);
+ }
+ if (isT && INSN(31,24) == BITS8(1,1,1,1,1,0,0,1)) {
+ UInt reformatted = INSN(23,0);
+ reformatted |= (BITS8(1,1,1,1,0,1,0,0) << 24);
+ return dis_neon_elem_or_struct_load(reformatted, isT, condT);
+ }
+
+ /* Doesn't match. */
+ return False;
+
+# undef INSN
+}
+
+
+/*------------------------------------------------------------*/
+/*--- V6 MEDIA instructions ---*/
+/*------------------------------------------------------------*/
+
+/* Both ARM and Thumb */
+
+/* Translate a V6 media instruction. On success, returns True; *dres
+   may or may not be updated. On failure, returns False without
+   changing *dres or creating any IR.
+
+ The Thumb and ARM encodings are completely different. In Thumb
+ mode, the caller must pass the entire 32 bits. In ARM mode it must
+ pass the lower 28 bits. Apart from that, callers may pass any
+ instruction; this function ignores anything it doesn't recognise.
+
+ Caller must supply an IRTemp 'condT' holding the gating condition,
+ or IRTemp_INVALID indicating the insn is always executed.
+
+   Caller must also supply an ARMCondcode 'conq'. This is used only
+   for debug printing and for no other purpose. For ARM, it is simply
+   the top 4 bits of the original instruction. For Thumb, the
+   condition is not (really) known until run time, so ARMCondAL
+   should be passed, merely so that printing of these instructions
+   does not show any condition.
+
+ Finally, the caller must indicate whether this occurs in ARM or in
+ Thumb code.
+*/
+static Bool decode_V6MEDIA_instruction (
+ /*MOD*/DisResult* dres,
+ UInt insnv6m,
+ IRTemp condT,
+ ARMCondcode conq,
+ Bool isT
+ )
+{
+# define INSNA(_bMax,_bMin) SLICE_UInt(insnv6m, (_bMax), (_bMin))
+# define INSNT0(_bMax,_bMin) SLICE_UInt( ((insnv6m >> 16) & 0xFFFF), \
+ (_bMax), (_bMin) )
+# define INSNT1(_bMax,_bMin) SLICE_UInt( ((insnv6m >> 0) & 0xFFFF), \
+ (_bMax), (_bMin) )
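+   /* For Thumb, insnv6m carries the first halfword of the insn in
+      bits [31:16] (hence INSNT0) and the second halfword in bits
+      [15:0] (INSNT1). */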
+ HChar dis_buf[128];
+ dis_buf[0] = 0;
+
+ if (isT) {
+ vassert(conq == ARMCondAL);
+ } else {
+ vassert(INSNA(31,28) == BITS4(0,0,0,0)); // caller's obligation
+ vassert(conq >= ARMCondEQ && conq <= ARMCondAL);
+ }
+
+ /* ----------- smulbb, smulbt, smultb, smultt ----------- */
+ {
+ UInt regD = 99, regM = 99, regN = 99, bitM = 0, bitN = 0;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFB1 && INSNT1(15,12) == BITS4(1,1,1,1)
+ && INSNT1(7,6) == BITS2(0,0)) {
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ regN = INSNT0(3,0);
+ bitM = INSNT1(4,4);
+ bitN = INSNT1(5,5);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (BITS8(0,0,0,1,0,1,1,0) == INSNA(27,20) &&
+ BITS4(0,0,0,0) == INSNA(15,12) &&
+ BITS4(1,0,0,0) == (INSNA(7,4) & BITS4(1,0,0,1)) ) {
+ regD = INSNA(19,16);
+ regM = INSNA(11,8);
+ regN = INSNA(3,0);
+ bitM = INSNA(6,6);
+ bitN = INSNA(5,5);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp srcN = newTemp(Ity_I32);
+ IRTemp srcM = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+
+ assign( srcN, binop(Iop_Sar32,
+ binop(Iop_Shl32,
+ isT ? getIRegT(regN) : getIRegA(regN),
+ mkU8(bitN ? 0 : 16)), mkU8(16)) );
+ assign( srcM, binop(Iop_Sar32,
+ binop(Iop_Shl32,
+ isT ? getIRegT(regM) : getIRegA(regM),
+ mkU8(bitM ? 0 : 16)), mkU8(16)) );
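+      /* The Shl32/Sar32 pairs sign-extend the selected halfword: with
+         bitN == 0 the operand is shifted left 16 then arithmetically
+         right 16 (bottom half); with bitN == 1 only the arithmetic
+         right shift by 16 is applied (top half). */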
+ assign( res, binop(Iop_Mul32, mkexpr(srcN), mkexpr(srcM)) );
+
+ if (isT)
+ putIRegT( regD, mkexpr(res), condT );
+ else
+ putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+ DIP( "smul%c%c%s r%u, r%u, r%u\n", bitN ? 't' : 'b', bitM ? 't' : 'b',
+ nCC(conq), regD, regN, regM );
+ return True;
+ }
+ /* fall through */
+ }
+
+   /* ------------ smulwb<c> <Rd>,<Rn>,<Rm> ------------- */
+   /* ------------ smulwt<c> <Rd>,<Rn>,<Rm> ------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99, bitM = 0;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFB3 && INSNT1(15,12) == BITS4(1,1,1,1)
+ && INSNT1(7,5) == BITS3(0,0,0)) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ bitM = INSNT1(4,4);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,0,0,1,0,0,1,0) &&
+ INSNA(15,12) == BITS4(0,0,0,0) &&
+ (INSNA(7,4) & BITS4(1,0,1,1)) == BITS4(1,0,1,0)) {
+ regD = INSNA(19,16);
+ regN = INSNA(3,0);
+ regM = INSNA(11,8);
+ bitM = INSNA(6,6);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_prod = newTemp(Ity_I64);
+
+ assign( irt_prod,
+ binop(Iop_MullS32,
+ isT ? getIRegT(regN) : getIRegA(regN),
+ binop(Iop_Sar32,
+ binop(Iop_Shl32,
+ isT ? getIRegT(regM) : getIRegA(regM),
+ mkU8(bitM ? 0 : 16)),
+ mkU8(16))) );
+
+ IRExpr* ire_result = binop(Iop_Or32,
+ binop( Iop_Shl32,
+ unop(Iop_64HIto32, mkexpr(irt_prod)),
+ mkU8(16) ),
+ binop( Iop_Shr32,
+ unop(Iop_64to32, mkexpr(irt_prod)),
+ mkU8(16) ) );
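+      /* ire_result is bits [47:16] of the 64-bit product: the top
+         half of the low word OR'd beneath the low half of the high
+         word, i.e. the 32-bit value (Rn * sx16(half of Rm)) >> 16. */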
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ DIP("smulw%c%s r%u, r%u, r%u\n",
+ bitM ? 't' : 'b', nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------ pkhbt<c> Rd, Rn, Rm {,LSL #imm} ------------- */
+ /* ------------ pkhtb<c> Rd, Rn, Rm {,ASR #imm} ------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99, imm5 = 99, shift_type = 99;
+ Bool tbform = False;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xEAC
+ && INSNT1(15,15) == 0 && INSNT1(4,4) == 0) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ imm5 = (INSNT1(14,12) << 2) | INSNT1(7,6);
+ shift_type = (INSNT1(5,5) << 1) | 0;
+ tbform = (INSNT1(5,5) == 0) ? False : True;
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,1,0,0,0) &&
+ INSNA(5,4) == BITS2(0,1) &&
+ (INSNA(6,6) == 0 || INSNA(6,6) == 1) ) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ imm5 = INSNA(11,7);
+ shift_type = (INSNA(6,6) << 1) | 0;
+ tbform = (INSNA(6,6) == 0) ? False : True;
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regM = newTemp(Ity_I32);
+ IRTemp irt_regM_shift = newTemp(Ity_I32);
+ assign( irt_regM, isT ? getIRegT(regM) : getIRegA(regM) );
+ compute_result_and_C_after_shift_by_imm5(
+ dis_buf, &irt_regM_shift, NULL, irt_regM, shift_type, imm5, regM );
+
+ UInt mask = (tbform == True) ? 0x0000FFFF : 0xFFFF0000;
+ IRExpr* ire_result
+ = binop( Iop_Or32,
+ binop(Iop_And32, mkexpr(irt_regM_shift), mkU32(mask)),
+ binop(Iop_And32, isT ? getIRegT(regN) : getIRegA(regN),
+ unop(Iop_Not32, mkU32(mask))) );
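+      /* For pkhbt, mask == 0xFFFF0000: the result takes the shifted
+         Rm's top half and Rn's bottom half.  For pkhtb the mask is
+         0x0000FFFF, selecting the shifted Rm's bottom half and Rn's
+         top half. */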
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ DIP( "pkh%s%s r%u, r%u, r%u %s\n", tbform ? "tb" : "bt",
+ nCC(conq), regD, regN, regM, dis_buf );
+
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ---------- usat<c> <Rd>,#<imm5>,<Rn>{,<shift>} ----------- */
+ {
+ UInt regD = 99, regN = 99, shift_type = 99, imm5 = 99, sat_imm = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,6) == BITS10(1,1,1,1,0,0,1,1,1,0)
+ && INSNT0(4,4) == 0
+ && INSNT1(15,15) == 0 && INSNT1(5,5) == 0) {
+ regD = INSNT1(11,8);
+ regN = INSNT0(3,0);
+ shift_type = (INSNT0(5,5) << 1) | 0;
+ imm5 = (INSNT1(14,12) << 2) | INSNT1(7,6);
+ sat_imm = INSNT1(4,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN))
+ gate = True;
+ if (shift_type == BITS2(1,0) && imm5 == 0)
+ gate = False;
+ }
+ } else {
+ if (INSNA(27,21) == BITS7(0,1,1,0,1,1,1) &&
+ INSNA(5,4) == BITS2(0,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(3,0);
+ shift_type = (INSNA(6,6) << 1) | 0;
+ imm5 = INSNA(11,7);
+ sat_imm = INSNA(20,16);
+ if (regD != 15 && regN != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regN = newTemp(Ity_I32);
+ IRTemp irt_regN_shift = newTemp(Ity_I32);
+ IRTemp irt_sat_Q = newTemp(Ity_I32);
+ IRTemp irt_result = newTemp(Ity_I32);
+
+ assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+ compute_result_and_C_after_shift_by_imm5(
+ dis_buf, &irt_regN_shift, NULL,
+ irt_regN, shift_type, imm5, regN );
+
+ armUnsignedSatQ( &irt_result, &irt_sat_Q, irt_regN_shift, sat_imm );
+ or_into_QFLAG32( mkexpr(irt_sat_Q), condT );
+
+ if (isT)
+ putIRegT( regD, mkexpr(irt_result), condT );
+ else
+ putIRegA( regD, mkexpr(irt_result), condT, Ijk_Boring );
+
+ DIP("usat%s r%u, #0x%04x, %s\n",
+             nCC(conq), regD, sat_imm, dis_buf);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ----------- ssat<c> <Rd>,#<imm5>,<Rn>{,<shift>} ----------- */
+ {
+ UInt regD = 99, regN = 99, shift_type = 99, imm5 = 99, sat_imm = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,6) == BITS10(1,1,1,1,0,0,1,1,0,0)
+ && INSNT0(4,4) == 0
+ && INSNT1(15,15) == 0 && INSNT1(5,5) == 0) {
+ regD = INSNT1(11,8);
+ regN = INSNT0(3,0);
+ shift_type = (INSNT0(5,5) << 1) | 0;
+ imm5 = (INSNT1(14,12) << 2) | INSNT1(7,6);
+ sat_imm = INSNT1(4,0) + 1;
+ if (!isBadRegT(regD) && !isBadRegT(regN))
+ gate = True;
+ if (shift_type == BITS2(1,0) && imm5 == 0)
+ gate = False;
+ }
+ } else {
+ if (INSNA(27,21) == BITS7(0,1,1,0,1,0,1) &&
+ INSNA(5,4) == BITS2(0,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(3,0);
+ shift_type = (INSNA(6,6) << 1) | 0;
+ imm5 = INSNA(11,7);
+ sat_imm = INSNA(20,16) + 1;
+ if (regD != 15 && regN != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regN = newTemp(Ity_I32);
+ IRTemp irt_regN_shift = newTemp(Ity_I32);
+ IRTemp irt_sat_Q = newTemp(Ity_I32);
+ IRTemp irt_result = newTemp(Ity_I32);
+
+ assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+ compute_result_and_C_after_shift_by_imm5(
+ dis_buf, &irt_regN_shift, NULL,
+ irt_regN, shift_type, imm5, regN );
+
+ armSignedSatQ( irt_regN_shift, sat_imm, &irt_result, &irt_sat_Q );
+ or_into_QFLAG32( mkexpr(irt_sat_Q), condT );
+
+ if (isT)
+ putIRegT( regD, mkexpr(irt_result), condT );
+ else
+ putIRegA( regD, mkexpr(irt_result), condT, Ijk_Boring );
+
+ DIP( "ssat%s r%u, #0x%04x, %s\n",
+              nCC(conq), regD, sat_imm, dis_buf);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* -------------- usat16<c> <Rd>,#<imm4>,<Rn> --------------- */
+ {
+ UInt regD = 99, regN = 99, sat_imm = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xF3A && (INSNT1(15,0) & 0xF0F0) == 0x0000) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ sat_imm = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,1,1,1,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,0,1,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(3,0);
+ sat_imm = INSNA(19,16);
+ if (regD != 15 && regN != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regN = newTemp(Ity_I32);
+ IRTemp irt_regN_lo = newTemp(Ity_I32);
+ IRTemp irt_regN_hi = newTemp(Ity_I32);
+ IRTemp irt_Q_lo = newTemp(Ity_I32);
+ IRTemp irt_Q_hi = newTemp(Ity_I32);
+ IRTemp irt_res_lo = newTemp(Ity_I32);
+ IRTemp irt_res_hi = newTemp(Ity_I32);
+
+ assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( irt_regN_lo, binop( Iop_Sar32,
+ binop(Iop_Shl32, mkexpr(irt_regN), mkU8(16)),
+ mkU8(16)) );
+ assign( irt_regN_hi, binop(Iop_Sar32, mkexpr(irt_regN), mkU8(16)) );
+
+ armUnsignedSatQ( &irt_res_lo, &irt_Q_lo, irt_regN_lo, sat_imm );
+ or_into_QFLAG32( mkexpr(irt_Q_lo), condT );
+
+ armUnsignedSatQ( &irt_res_hi, &irt_Q_hi, irt_regN_hi, sat_imm );
+ or_into_QFLAG32( mkexpr(irt_Q_hi), condT );
+
+ IRExpr* ire_result = binop( Iop_Or32,
+ binop(Iop_Shl32, mkexpr(irt_res_hi), mkU8(16)),
+ mkexpr(irt_res_lo) );
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ DIP( "usat16%s r%u, #0x%04x, r%u\n", nCC(conq), regD, sat_imm, regN );
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* -------------- uadd16<c> <Rd>,<Rn>,<Rm> -------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA9 && (INSNT1(15,0) & 0xF0F0) == 0xF040) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,1,0,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,0,0,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp reso = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res, binop(Iop_Add16x2, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res), condT );
+ else
+ putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+ assign(reso, binop(Iop_HAdd16Ux2, mkexpr(rNt), mkexpr(rMt)));
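+      /* The unsigned halving add computes (a + b) >> 1 per lane and
+         cannot overflow, so bits 15 and 31 of 'reso' are the
+         carry-outs of the low and high 16-bit adds; they become
+         GE[1:0] and GE[3:2] respectively. */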
+ set_GE_32_10_from_bits_31_15(reso, condT);
+
+ DIP("uadd16%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* -------------- sadd16<c> <Rd>,<Rn>,<Rm> -------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA9 && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,0,0,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp reso = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res, binop(Iop_Add16x2, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res), condT );
+ else
+ putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+ assign(reso, unop(Iop_Not32,
+ binop(Iop_HAdd16Sx2, mkexpr(rNt), mkexpr(rMt))));
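+      /* In the signed case, GE is set when a lane's full (17-bit) sum
+         is >= 0.  Bit 15/31 of the signed halving add is the sign of
+         that sum, so it is inverted before landing in GE. */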
+ set_GE_32_10_from_bits_31_15(reso, condT);
+
+ DIP("sadd16%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ---------------- usub16<c> <Rd>,<Rn>,<Rm> ---------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAD && (INSNT1(15,0) & 0xF0F0) == 0xF040) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,1,0,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,1,1,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp reso = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res, binop(Iop_Sub16x2, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res), condT );
+ else
+ putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+ assign(reso, unop(Iop_Not32,
+ binop(Iop_HSub16Ux2, mkexpr(rNt), mkexpr(rMt))));
+ set_GE_32_10_from_bits_31_15(reso, condT);
+
+ DIP("usub16%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* -------------- ssub16<c> <Rd>,<Rn>,<Rm> -------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAD && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,1,1,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp reso = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res, binop(Iop_Sub16x2, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res), condT );
+ else
+ putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+ assign(reso, unop(Iop_Not32,
+ binop(Iop_HSub16Sx2, mkexpr(rNt), mkexpr(rMt))));
+ set_GE_32_10_from_bits_31_15(reso, condT);
+
+ DIP("ssub16%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ----------------- uadd8<c> <Rd>,<Rn>,<Rm> ---------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA8 && (INSNT1(15,0) & 0xF0F0) == 0xF040) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,1,0,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ (INSNA(7,4) == BITS4(1,0,0,1))) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp reso = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res, binop(Iop_Add8x4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res), condT );
+ else
+ putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+ assign(reso, binop(Iop_HAdd8Ux4, mkexpr(rNt), mkexpr(rMt)));
+ set_GE_3_2_1_0_from_bits_31_23_15_7(reso, condT);
+
+ DIP("uadd8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- sadd8<c> <Rd>,<Rn>,<Rm> ------------------ */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA8 && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ (INSNA(7,4) == BITS4(1,0,0,1))) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp reso = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res, binop(Iop_Add8x4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res), condT );
+ else
+ putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+ assign(reso, unop(Iop_Not32,
+ binop(Iop_HAdd8Sx4, mkexpr(rNt), mkexpr(rMt))));
+ set_GE_3_2_1_0_from_bits_31_23_15_7(reso, condT);
+
+ DIP("sadd8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- usub8<c> <Rd>,<Rn>,<Rm> ------------------ */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAC && (INSNT1(15,0) & 0xF0F0) == 0xF040) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,1,0,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ (INSNA(7,4) == BITS4(1,1,1,1))) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp reso = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res, binop(Iop_Sub8x4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res), condT );
+ else
+ putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+ assign(reso, unop(Iop_Not32,
+ binop(Iop_HSub8Ux4, mkexpr(rNt), mkexpr(rMt))));
+ set_GE_3_2_1_0_from_bits_31_23_15_7(reso, condT);
+
+ DIP("usub8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- ssub8<c> <Rd>,<Rn>,<Rm> ------------------ */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAC && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(1,1,1,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp reso = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res, binop(Iop_Sub8x4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res), condT );
+ else
+ putIRegA( regD, mkexpr(res), condT, Ijk_Boring );
+
+ assign(reso, unop(Iop_Not32,
+ binop(Iop_HSub8Sx4, mkexpr(rNt), mkexpr(rMt))));
+ set_GE_3_2_1_0_from_bits_31_23_15_7(reso, condT);
+
+ DIP("ssub8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------ qadd8<c> <Rd>,<Rn>,<Rm> ------------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA8 && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(1,0,0,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res_q = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res_q, binop(Iop_QAdd8Sx4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res_q), condT );
+ else
+ putIRegA( regD, mkexpr(res_q), condT, Ijk_Boring );
+
+ DIP("qadd8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------ qsub8<c> <Rd>,<Rn>,<Rm> ------------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAC && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(1,1,1,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res_q = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res_q, binop(Iop_QSub8Sx4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res_q), condT );
+ else
+ putIRegA( regD, mkexpr(res_q), condT, Ijk_Boring );
+
+ DIP("qsub8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------ uqadd8<c> <Rd>,<Rn>,<Rm> ------------------ */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA8 && (INSNT1(15,0) & 0xF0F0) == 0xF050) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,1,1,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ (INSNA(7,4) == BITS4(1,0,0,1))) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res_q = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res_q, binop(Iop_QAdd8Ux4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res_q), condT );
+ else
+ putIRegA( regD, mkexpr(res_q), condT, Ijk_Boring );
+
+ DIP("uqadd8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------ uqsub8<c> <Rd>,<Rn>,<Rm> ------------------ */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAC && (INSNT1(15,0) & 0xF0F0) == 0xF050) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,1,1,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ (INSNA(7,4) == BITS4(1,1,1,1))) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res_q = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res_q, binop(Iop_QSub8Ux4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res_q), condT );
+ else
+ putIRegA( regD, mkexpr(res_q), condT, Ijk_Boring );
+
+ DIP("uqsub8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ----------------- uhadd8<c> <Rd>,<Rn>,<Rm> ------------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA8 && (INSNT1(15,0) & 0xF0F0) == 0xF060) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,1,1,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(1,0,0,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res_q = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res_q, binop(Iop_HAdd8Ux4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res_q), condT );
+ else
+ putIRegA( regD, mkexpr(res_q), condT, Ijk_Boring );
+
+ DIP("uhadd8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ----------------- shadd8<c> <Rd>,<Rn>,<Rm> ------------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA8 && (INSNT1(15,0) & 0xF0F0) == 0xF020) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,1,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(1,0,0,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res_q = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res_q, binop(Iop_HAdd8Sx4, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res_q), condT );
+ else
+ putIRegA( regD, mkexpr(res_q), condT, Ijk_Boring );
+
+ DIP("shadd8%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------ qadd16<c> <Rd>,<Rn>,<Rm> ------------------ */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA9 && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,0,0,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res_q = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res_q, binop(Iop_QAdd16Sx2, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res_q), condT );
+ else
+ putIRegA( regD, mkexpr(res_q), condT, Ijk_Boring );
+
+ DIP("qadd16%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------ qsub16<c> <Rd>,<Rn>,<Rm> ------------------ */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAD && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,1,1,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res_q = newTemp(Ity_I32);
+
+ assign( rNt, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( rMt, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign(res_q, binop(Iop_QSub16Sx2, mkexpr(rNt), mkexpr(rMt)));
+ if (isT)
+ putIRegT( regD, mkexpr(res_q), condT );
+ else
+ putIRegA( regD, mkexpr(res_q), condT, Ijk_Boring );
+
+ DIP("qsub16%s r%u, r%u, r%u\n", nCC(conq),regD,regN,regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /////////////////////////////////////////////////////////////////
+ /////////////////////////////////////////////////////////////////
+ /////////////////////////////////////////////////////////////////
+ /////////////////////////////////////////////////////////////////
+ /////////////////////////////////////////////////////////////////
+
+ /* ------------------- qsax<c> <Rd>,<Rn>,<Rm> ------------------- */
+   /* note: the hardware seems to construct the result differently
+      from what the manual says. */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAE && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,1,0,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regN = newTemp(Ity_I32);
+ IRTemp irt_regM = newTemp(Ity_I32);
+ IRTemp irt_sum = newTemp(Ity_I32);
+ IRTemp irt_diff = newTemp(Ity_I32);
+ IRTemp irt_sum_res = newTemp(Ity_I32);
+ IRTemp irt_diff_res = newTemp(Ity_I32);
+
+ assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( irt_regM, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign( irt_diff,
+ binop( Iop_Sub32,
+ binop( Iop_Sar32, mkexpr(irt_regN), mkU8(16) ),
+ binop( Iop_Sar32,
+ binop(Iop_Shl32, mkexpr(irt_regM), mkU8(16)),
+ mkU8(16) ) ) );
+ armSignedSatQ( irt_diff, 0x10, &irt_diff_res, NULL);
+
+ assign( irt_sum,
+ binop( Iop_Add32,
+ binop( Iop_Sar32,
+ binop( Iop_Shl32, mkexpr(irt_regN), mkU8(16) ),
+ mkU8(16) ),
+ binop( Iop_Sar32, mkexpr(irt_regM), mkU8(16) )) );
+ armSignedSatQ( irt_sum, 0x10, &irt_sum_res, NULL );
+
+ IRExpr* ire_result = binop( Iop_Or32,
+ binop( Iop_Shl32, mkexpr(irt_diff_res),
+ mkU8(16) ),
+ binop( Iop_And32, mkexpr(irt_sum_res),
+ mkU32(0xFFFF)) );
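+      /* The saturated difference of the top halves lands in Rd[31:16]
+         and the saturated sum in Rd[15:0]. */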
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ DIP( "qsax%s r%u, r%u, r%u\n", nCC(conq), regD, regN, regM );
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- qasx<c> <Rd>,<Rn>,<Rm> ------------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAA && (INSNT1(15,0) & 0xF0F0) == 0xF010) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,1,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,0,1,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regN = newTemp(Ity_I32);
+ IRTemp irt_regM = newTemp(Ity_I32);
+ IRTemp irt_sum = newTemp(Ity_I32);
+ IRTemp irt_diff = newTemp(Ity_I32);
+ IRTemp irt_res_sum = newTemp(Ity_I32);
+ IRTemp irt_res_diff = newTemp(Ity_I32);
+
+ assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( irt_regM, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign( irt_diff,
+ binop( Iop_Sub32,
+ binop( Iop_Sar32,
+ binop( Iop_Shl32, mkexpr(irt_regN), mkU8(16) ),
+ mkU8(16) ),
+ binop( Iop_Sar32, mkexpr(irt_regM), mkU8(16) ) ) );
+ armSignedSatQ( irt_diff, 0x10, &irt_res_diff, NULL );
+
+ assign( irt_sum,
+ binop( Iop_Add32,
+ binop( Iop_Sar32, mkexpr(irt_regN), mkU8(16) ),
+ binop( Iop_Sar32,
+ binop( Iop_Shl32, mkexpr(irt_regM), mkU8(16) ),
+ mkU8(16) ) ) );
+ armSignedSatQ( irt_sum, 0x10, &irt_res_sum, NULL );
+
+ IRExpr* ire_result
+ = binop( Iop_Or32,
+ binop( Iop_Shl32, mkexpr(irt_res_sum), mkU8(16) ),
+ binop( Iop_And32, mkexpr(irt_res_diff), mkU32(0xFFFF) ) );
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ DIP( "qasx%s r%u, r%u, r%u\n", nCC(conq), regD, regN, regM );
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- sasx<c> <Rd>,<Rn>,<Rm> ------------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAA && (INSNT1(15,0) & 0xF0F0) == 0xF000) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,0,0,0,1) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(0,0,1,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regN = newTemp(Ity_I32);
+ IRTemp irt_regM = newTemp(Ity_I32);
+ IRTemp irt_sum = newTemp(Ity_I32);
+ IRTemp irt_diff = newTemp(Ity_I32);
+
+ assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( irt_regM, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ assign( irt_diff,
+ binop( Iop_Sub32,
+ binop( Iop_Sar32,
+ binop( Iop_Shl32, mkexpr(irt_regN), mkU8(16) ),
+ mkU8(16) ),
+ binop( Iop_Sar32, mkexpr(irt_regM), mkU8(16) ) ) );
+
+ assign( irt_sum,
+ binop( Iop_Add32,
+ binop( Iop_Sar32, mkexpr(irt_regN), mkU8(16) ),
+ binop( Iop_Sar32,
+ binop( Iop_Shl32, mkexpr(irt_regM), mkU8(16) ),
+ mkU8(16) ) ) );
+
+ IRExpr* ire_result
+ = binop( Iop_Or32,
+ binop( Iop_Shl32, mkexpr(irt_sum), mkU8(16) ),
+ binop( Iop_And32, mkexpr(irt_diff), mkU32(0xFFFF) ) );
+
+ IRTemp ge10 = newTemp(Ity_I32);
+ assign(ge10, unop(Iop_Not32, mkexpr(irt_diff)));
+ put_GEFLAG32( 0, 31, mkexpr(ge10), condT );
+ put_GEFLAG32( 1, 31, mkexpr(ge10), condT );
+
+ IRTemp ge32 = newTemp(Ity_I32);
+ assign(ge32, unop(Iop_Not32, mkexpr(irt_sum)));
+ put_GEFLAG32( 2, 31, mkexpr(ge32), condT );
+ put_GEFLAG32( 3, 31, mkexpr(ge32), condT );
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ DIP( "sasx%s r%u, r%u, r%u\n", nCC(conq), regD, regN, regM );
+ return True;
+ }
+ /* fall through */
+ }
+
+   /* --------------- smuad, smuadx<c> <Rd>,<Rn>,<Rm> --------------- */
+   /* --------------- smusd, smusdx<c> <Rd>,<Rn>,<Rm> --------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99, bitM = 99;
+ Bool gate = False, isAD = False;
+
+ if (isT) {
+ if ((INSNT0(15,4) == 0xFB2 || INSNT0(15,4) == 0xFB4)
+ && (INSNT1(15,0) & 0xF0E0) == 0xF000) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ bitM = INSNT1(4,4);
+ isAD = INSNT0(15,4) == 0xFB2;
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,1,0,0,0,0) &&
+ INSNA(15,12) == BITS4(1,1,1,1) &&
+ (INSNA(7,4) & BITS4(1,0,0,1)) == BITS4(0,0,0,1) ) {
+ regD = INSNA(19,16);
+ regN = INSNA(3,0);
+ regM = INSNA(11,8);
+ bitM = INSNA(5,5);
+ isAD = INSNA(6,6) == 0;
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regN = newTemp(Ity_I32);
+ IRTemp irt_regM = newTemp(Ity_I32);
+ IRTemp irt_prod_lo = newTemp(Ity_I32);
+ IRTemp irt_prod_hi = newTemp(Ity_I32);
+ IRTemp tmpM = newTemp(Ity_I32);
+
+ assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+
+ assign( tmpM, isT ? getIRegT(regM) : getIRegA(regM) );
+ assign( irt_regM, genROR32(tmpM, (bitM & 1) ? 16 : 0) );
+
+ assign( irt_prod_lo,
+ binop( Iop_Mul32,
+ binop( Iop_Sar32,
+ binop(Iop_Shl32, mkexpr(irt_regN), mkU8(16)),
+ mkU8(16) ),
+ binop( Iop_Sar32,
+ binop(Iop_Shl32, mkexpr(irt_regM), mkU8(16)),
+ mkU8(16) ) ) );
+ assign( irt_prod_hi, binop(Iop_Mul32,
+ binop(Iop_Sar32, mkexpr(irt_regN), mkU8(16)),
+ binop(Iop_Sar32, mkexpr(irt_regM), mkU8(16))) );
+ IRExpr* ire_result
+ = binop( isAD ? Iop_Add32 : Iop_Sub32,
+ mkexpr(irt_prod_lo), mkexpr(irt_prod_hi) );
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ if (isAD) {
+ or_into_QFLAG32(
+ signed_overflow_after_Add32( ire_result,
+ irt_prod_lo, irt_prod_hi ),
+ condT
+ );
+ }
+
+ DIP("smu%cd%s%s r%u, r%u, r%u\n",
+ isAD ? 'a' : 's',
+ bitM ? "x" : "", nCC(conq), regD, regN, regM);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* --------------- smlad{X}<c> <Rd>,<Rn>,<Rm>,<Ra> -------------- */
+ /* --------------- smlsd{X}<c> <Rd>,<Rn>,<Rm>,<Ra> -------------- */
+ {
+ UInt regD = 99, regN = 99, regM = 99, regA = 99, bitM = 99;
+ Bool gate = False, isAD = False;
+
+ if (isT) {
+ if ((INSNT0(15,4) == 0xFB2 || INSNT0(15,4) == 0xFB4)
+ && INSNT1(7,5) == BITS3(0,0,0)) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ regA = INSNT1(15,12);
+ bitM = INSNT1(4,4);
+ isAD = INSNT0(15,4) == 0xFB2;
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM)
+ && !isBadRegT(regA))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,1,0,0,0,0) &&
+ (INSNA(7,4) & BITS4(1,0,0,1)) == BITS4(0,0,0,1)) {
+ regD = INSNA(19,16);
+ regA = INSNA(15,12);
+ regN = INSNA(3,0);
+ regM = INSNA(11,8);
+ bitM = INSNA(5,5);
+ isAD = INSNA(6,6) == 0;
+ if (regD != 15 && regN != 15 && regM != 15 && regA != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regN = newTemp(Ity_I32);
+ IRTemp irt_regM = newTemp(Ity_I32);
+ IRTemp irt_regA = newTemp(Ity_I32);
+ IRTemp irt_prod_lo = newTemp(Ity_I32);
+ IRTemp irt_prod_hi = newTemp(Ity_I32);
+ IRTemp irt_sum = newTemp(Ity_I32);
+ IRTemp tmpM = newTemp(Ity_I32);
+
+ assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+ assign( irt_regA, isT ? getIRegT(regA) : getIRegA(regA) );
+
+ assign( tmpM, isT ? getIRegT(regM) : getIRegA(regM) );
+ assign( irt_regM, genROR32(tmpM, (bitM & 1) ? 16 : 0) );
+
+ assign( irt_prod_lo,
+ binop(Iop_Mul32,
+ binop(Iop_Sar32,
+ binop( Iop_Shl32, mkexpr(irt_regN), mkU8(16) ),
+ mkU8(16)),
+ binop(Iop_Sar32,
+ binop( Iop_Shl32, mkexpr(irt_regM), mkU8(16) ),
+ mkU8(16))) );
+ assign( irt_prod_hi,
+ binop( Iop_Mul32,
+ binop( Iop_Sar32, mkexpr(irt_regN), mkU8(16) ),
+ binop( Iop_Sar32, mkexpr(irt_regM), mkU8(16) ) ) );
+ assign( irt_sum, binop( isAD ? Iop_Add32 : Iop_Sub32,
+ mkexpr(irt_prod_lo), mkexpr(irt_prod_hi) ) );
+
+ IRExpr* ire_result = binop(Iop_Add32, mkexpr(irt_sum), mkexpr(irt_regA));
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ if (isAD) {
+ or_into_QFLAG32(
+ signed_overflow_after_Add32( mkexpr(irt_sum),
+ irt_prod_lo, irt_prod_hi ),
+ condT
+ );
+ }
+
+ or_into_QFLAG32(
+ signed_overflow_after_Add32( ire_result, irt_sum, irt_regA ),
+ condT
+ );
+
+ DIP("sml%cd%s%s r%u, r%u, r%u, r%u\n",
+ isAD ? 'a' : 's',
+ bitM ? "x" : "", nCC(conq), regD, regN, regM, regA);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ----- smlabb, smlabt, smlatb, smlatt <Rd>,<Rn>,<Rm>,<Ra> ----- */
+ {
+ UInt regD = 99, regN = 99, regM = 99, regA = 99, bitM = 99, bitN = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFB1 && INSNT1(7,6) == BITS2(0,0)) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ regA = INSNT1(15,12);
+ bitM = INSNT1(4,4);
+ bitN = INSNT1(5,5);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM)
+ && !isBadRegT(regA))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,0,0,1,0,0,0,0) &&
+ (INSNA(7,4) & BITS4(1,0,0,1)) == BITS4(1,0,0,0)) {
+ regD = INSNA(19,16);
+ regN = INSNA(3,0);
+ regM = INSNA(11,8);
+ regA = INSNA(15,12);
+ bitM = INSNA(6,6);
+ bitN = INSNA(5,5);
+ if (regD != 15 && regN != 15 && regM != 15 && regA != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regA = newTemp(Ity_I32);
+ IRTemp irt_prod = newTemp(Ity_I32);
+
+ assign( irt_prod,
+ binop(Iop_Mul32,
+ binop(Iop_Sar32,
+ binop(Iop_Shl32,
+ isT ? getIRegT(regN) : getIRegA(regN),
+ mkU8(bitN ? 0 : 16)),
+ mkU8(16)),
+ binop(Iop_Sar32,
+ binop(Iop_Shl32,
+ isT ? getIRegT(regM) : getIRegA(regM),
+ mkU8(bitM ? 0 : 16)),
+ mkU8(16))) );
+
+ assign( irt_regA, isT ? getIRegT(regA) : getIRegA(regA) );
+
+ IRExpr* ire_result = binop(Iop_Add32, mkexpr(irt_prod), mkexpr(irt_regA));
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ or_into_QFLAG32(
+ signed_overflow_after_Add32( ire_result, irt_prod, irt_regA ),
+ condT
+ );
+
+ DIP( "smla%c%c%s r%u, r%u, r%u, r%u\n",
+ bitN ? 't' : 'b', bitM ? 't' : 'b',
+ nCC(conq), regD, regN, regM, regA );
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ----- smlawb, smlawt <Rd>,<Rn>,<Rm>,<Ra> ----- */
+ {
+ UInt regD = 99, regN = 99, regM = 99, regA = 99, bitM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFB3 && INSNT1(7,5) == BITS3(0,0,0)) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ regA = INSNT1(15,12);
+ bitM = INSNT1(4,4);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM)
+ && !isBadRegT(regA))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,0,0,1,0,0,1,0) &&
+ (INSNA(7,4) & BITS4(1,0,1,1)) == BITS4(1,0,0,0)) {
+ regD = INSNA(19,16);
+ regN = INSNA(3,0);
+ regM = INSNA(11,8);
+ regA = INSNA(15,12);
+ bitM = INSNA(6,6);
+ if (regD != 15 && regN != 15 && regM != 15 && regA != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regA = newTemp(Ity_I32);
+ IRTemp irt_prod = newTemp(Ity_I64);
+
+ assign( irt_prod,
+ binop(Iop_MullS32,
+ isT ? getIRegT(regN) : getIRegA(regN),
+ binop(Iop_Sar32,
+ binop(Iop_Shl32,
+ isT ? getIRegT(regM) : getIRegA(regM),
+ mkU8(bitM ? 0 : 16)),
+ mkU8(16))) );
+
+ assign( irt_regA, isT ? getIRegT(regA) : getIRegA(regA) );
+
+ IRTemp prod32 = newTemp(Ity_I32);
+ assign(prod32,
+ binop(Iop_Or32,
+ binop(Iop_Shl32, unop(Iop_64HIto32, mkexpr(irt_prod)), mkU8(16)),
+ binop(Iop_Shr32, unop(Iop_64to32, mkexpr(irt_prod)), mkU8(16))
+ ));
+
+ IRExpr* ire_result = binop(Iop_Add32, mkexpr(prod32), mkexpr(irt_regA));
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ or_into_QFLAG32(
+ signed_overflow_after_Add32( ire_result, prod32, irt_regA ),
+ condT
+ );
+
+ DIP( "smlaw%c%s r%u, r%u, r%u, r%u\n",
+ bitM ? 't' : 'b',
+ nCC(conq), regD, regN, regM, regA );
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- sel<c> <Rd>,<Rn>,<Rm> -------------------- */
+ /* fixme: fix up the test in v6media.c so that we can pass the ge
+ flags as part of the test. */
+ {
+ UInt regD = 99, regN = 99, regM = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFAA && (INSNT1(15,0) & 0xF0F0) == 0xF080) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,1,0,0,0) &&
+ INSNA(11,8) == BITS4(1,1,1,1) &&
+ INSNA(7,4) == BITS4(1,0,1,1)) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_ge_flag0 = newTemp(Ity_I32);
+ IRTemp irt_ge_flag1 = newTemp(Ity_I32);
+ IRTemp irt_ge_flag2 = newTemp(Ity_I32);
+ IRTemp irt_ge_flag3 = newTemp(Ity_I32);
+
+ assign( irt_ge_flag0, get_GEFLAG32(0) );
+ assign( irt_ge_flag1, get_GEFLAG32(1) );
+ assign( irt_ge_flag2, get_GEFLAG32(2) );
+ assign( irt_ge_flag3, get_GEFLAG32(3) );
+
+ IRExpr* ire_ge_flag0_or
+ = binop(Iop_Or32, mkexpr(irt_ge_flag0),
+ binop(Iop_Sub32, mkU32(0), mkexpr(irt_ge_flag0)));
+ IRExpr* ire_ge_flag1_or
+ = binop(Iop_Or32, mkexpr(irt_ge_flag1),
+ binop(Iop_Sub32, mkU32(0), mkexpr(irt_ge_flag1)));
+ IRExpr* ire_ge_flag2_or
+ = binop(Iop_Or32, mkexpr(irt_ge_flag2),
+ binop(Iop_Sub32, mkU32(0), mkexpr(irt_ge_flag2)));
+ IRExpr* ire_ge_flag3_or
+ = binop(Iop_Or32, mkexpr(irt_ge_flag3),
+ binop(Iop_Sub32, mkU32(0), mkexpr(irt_ge_flag3)));
+
+ IRExpr* ire_ge_flags
+ = binop( Iop_Or32,
+ binop(Iop_Or32,
+ binop(Iop_And32,
+ binop(Iop_Sar32, ire_ge_flag0_or, mkU8(31)),
+ mkU32(0x000000ff)),
+ binop(Iop_And32,
+ binop(Iop_Sar32, ire_ge_flag1_or, mkU8(31)),
+ mkU32(0x0000ff00))),
+ binop(Iop_Or32,
+ binop(Iop_And32,
+ binop(Iop_Sar32, ire_ge_flag2_or, mkU8(31)),
+ mkU32(0x00ff0000)),
+ binop(Iop_And32,
+ binop(Iop_Sar32, ire_ge_flag3_or, mkU8(31)),
+ mkU32(0xff000000))) );
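+      /* For each flag value x, x | (0 - x) has bit 31 set iff
+         x != 0; arithmetic-shifting that right by 31 then yields
+         0xFFFFFFFF if the flag is set and 0 otherwise.  ANDing with
+         the per-lane masks builds a byte-granularity select mask:
+         the result below takes each byte from Rn where the
+         corresponding GE bit is set, and from Rm where it is not. */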
+
+ IRExpr* ire_result
+ = binop(Iop_Or32,
+ binop(Iop_And32,
+ isT ? getIRegT(regN) : getIRegA(regN),
+ ire_ge_flags ),
+ binop(Iop_And32,
+ isT ? getIRegT(regM) : getIRegA(regM),
+ unop(Iop_Not32, ire_ge_flags)));
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ DIP("sel%s r%u, r%u, r%u\n", nCC(conq), regD, regN, regM );
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ----------------- uxtab16<c> Rd,Rn,Rm{,rot} ------------------ */
+ {
+ UInt regD = 99, regN = 99, regM = 99, rotate = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFA3 && (INSNT1(15,0) & 0xF0C0) == 0xF080) {
+ regN = INSNT0(3,0);
+ regD = INSNT1(11,8);
+ regM = INSNT1(3,0);
+ rotate = INSNT1(5,4);
+ if (!isBadRegT(regD) && !isBadRegT(regN) && !isBadRegT(regM))
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,0,1,1,0,0) &&
+ INSNA(9,4) == BITS6(0,0,0,1,1,1) ) {
+ regD = INSNA(15,12);
+ regN = INSNA(19,16);
+ regM = INSNA(3,0);
+ rotate = INSNA(11,10);
+ if (regD != 15 && regN != 15 && regM != 15)
+ gate = True;
+ }
+ }
+
+ if (gate) {
+ IRTemp irt_regN = newTemp(Ity_I32);
+ assign( irt_regN, isT ? getIRegT(regN) : getIRegA(regN) );
+
+ IRTemp irt_regM = newTemp(Ity_I32);
+ assign( irt_regM, isT ? getIRegT(regM) : getIRegA(regM) );
+
+ IRTemp irt_rot = newTemp(Ity_I32);
+ assign( irt_rot, binop(Iop_And32,
+ genROR32(irt_regM, 8 * rotate),
+ mkU32(0x00FF00FF)) );
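+      /* genROR32 rotates Rm right by 0, 8, 16 or 24 bits; the
+         0x00FF00FF mask then leaves one zero-extended byte in each
+         16-bit lane.  E.g. Rm = 0xAABBCCDD with ROR #8 rotates to
+         0xDDAABBCC and masks to 0x00AA00CC, so 0xCC is added into
+         the low halfword of Rn and 0xAA into the high halfword. */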
+
+ IRExpr* resLo
+ = binop(Iop_And32,
+ binop(Iop_Add32, mkexpr(irt_regN), mkexpr(irt_rot)),
+ mkU32(0x0000FFFF));
+
+ IRExpr* resHi
+ = binop(Iop_Add32,
+ binop(Iop_And32, mkexpr(irt_regN), mkU32(0xFFFF0000)),
+ binop(Iop_And32, mkexpr(irt_rot), mkU32(0xFFFF0000)));
+
+ IRExpr* ire_result
+ = binop( Iop_Or32, resHi, resLo );
+
+ if (isT)
+ putIRegT( regD, ire_result, condT );
+ else
+ putIRegA( regD, ire_result, condT, Ijk_Boring );
+
+ DIP( "uxtab16%s r%u, r%u, r%u, ROR #%u\n",
+ nCC(conq), regD, regN, regM, 8 * rotate );
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* --------------- usad8 Rd,Rn,Rm ---------------- */
+ /* --------------- usada8 Rd,Rn,Rm,Ra ---------------- */
+ {
+ UInt rD = 99, rN = 99, rM = 99, rA = 99;
+ Bool gate = False;
+
+ if (isT) {
+ if (INSNT0(15,4) == 0xFB7 && INSNT1(7,4) == BITS4(0,0,0,0)) {
+ rN = INSNT0(3,0);
+ rA = INSNT1(15,12);
+ rD = INSNT1(11,8);
+ rM = INSNT1(3,0);
+ if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM) && rA != 13)
+ gate = True;
+ }
+ } else {
+ if (INSNA(27,20) == BITS8(0,1,1,1,1,0,0,0) &&
+ INSNA(7,4) == BITS4(0,0,0,1) ) {
+ rD = INSNA(19,16);
+ rA = INSNA(15,12);
+ rM = INSNA(11,8);
+ rN = INSNA(3,0);
+ if (rD != 15 && rN != 15 && rM != 15 /* but rA can be 15 */)
+ gate = True;
+ }
+ }
+ /* We allow rA == 15, to denote the usad8 (no accumulator) case. */
+
+ if (gate) {
+ IRExpr* rNe = isT ? getIRegT(rN) : getIRegA(rN);
+ IRExpr* rMe = isT ? getIRegT(rM) : getIRegA(rM);
+ IRExpr* rAe = rA == 15 ? mkU32(0)
+ : (isT ? getIRegT(rA) : getIRegA(rA));
+ IRExpr* res = binop(Iop_Add32,
+ binop(Iop_Sad8Ux4, rNe, rMe),
+ rAe);
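+      /* Iop_Sad8Ux4 sums the absolute differences of the four
+         unsigned byte lanes.  E.g. rN = 0x01020304, rM = 0x04030201
+         gives |1-4| + |2-3| + |3-2| + |4-1| = 8, to which rA (or
+         zero, in the usad8 case) is added. */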
+ if (isT)
+ putIRegT( rD, res, condT );
+ else
+ putIRegA( rD, res, condT, Ijk_Boring );
+
+ if (rA == 15) {
+ DIP( "usad8%s r%u, r%u, r%u\n",
+ nCC(conq), rD, rN, rM );
+ } else {
+ DIP( "usada8%s r%u, r%u, r%u, r%u\n",
+ nCC(conq), rD, rN, rM, rA );
+ }
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ---------- Doesn't match anything. ---------- */
+ return False;
+
+# undef INSNA
+# undef INSNT0
+# undef INSNT1
+}
+
+
+/*------------------------------------------------------------*/
+/*--- LDMxx/STMxx helper (both ARM and Thumb32) ---*/
+/*------------------------------------------------------------*/
+
+/* Generate IR for LDMxx and STMxx. This is complex. Assumes it's
+ unconditional, so the caller must produce a jump-around before
+ calling this, if the insn is to be conditional. Caller is
+ responsible for all validation of parameters. For LDMxx, if PC is
+ amongst the values loaded, caller is also responsible for
+ generating the jump. */
+static void mk_ldm_stm ( Bool arm, /* True: ARM, False: Thumb */
+ UInt rN, /* base reg */
+ UInt bINC, /* 1: inc, 0: dec */
+ UInt bBEFORE, /* 1: inc/dec before, 0: after */
+ UInt bW, /* 1: writeback to Rn */
+ UInt bL, /* 1: load, 0: store */
+ UInt regList )
+{
+ Int i, r, m, nRegs;
+
+ /* Get hold of the old Rn value. We might need to write its value
+ to memory during a store, and if it's also the writeback
+ register then we need to get its value now. We can't treat it
+ exactly like the other registers we're going to transfer,
+ because for xxMDA and xxMDB writeback forms, the generated IR
+ updates Rn in the guest state before any transfers take place.
+      We have to do this as per comments below, so that if Rn is
+      the stack pointer then it always has a value below or equal
+      to any of the transfer addresses.  Ick. */
+ IRTemp oldRnT = newTemp(Ity_I32);
+ assign(oldRnT, arm ? getIRegA(rN) : getIRegT(rN));
+
+ IRTemp anchorT = newTemp(Ity_I32);
+ /* The old (Addison-Wesley) ARM ARM seems to say that LDMxx/STMxx
+ ignore the bottom two bits of the address. However, Cortex-A8
+ doesn't seem to care. Hence: */
+ /* No .. don't force alignment .. */
+ /* assign(anchorT, binop(Iop_And32, mkexpr(oldRnT), mkU32(~3U))); */
+ /* Instead, use the potentially misaligned address directly. */
+ assign(anchorT, mkexpr(oldRnT));
+
+ IROp opADDorSUB = bINC ? Iop_Add32 : Iop_Sub32;
+ // bINC == 1: xxMIA, xxMIB
+ // bINC == 0: xxMDA, xxMDB
+
+   // For xxMDA and xxMDB, update Rn first if necessary.  We have to
+   // do this first because, in the common idiom of pushing stuff onto
+   // a stack that is growing down onto allocate-on-fault pages (as
+   // Valgrind simulates), the transfers may fault, and so the SP must
+   // already be up-to-date, "covering" (pointing below) the transfer
+   // area.  For the same reason, if we are doing xxMIA or xxMIB, do
+   // the transfer first, and then update rN afterwards.
+ nRegs = 0;
+ for (i = 0; i < 16; i++) {
+ if ((regList & (1 << i)) != 0)
+ nRegs++;
+ }
+ if (bW == 1 && !bINC) {
+ IRExpr* e = binop(opADDorSUB, mkexpr(oldRnT), mkU32(4*nRegs));
+ if (arm)
+ putIRegA( rN, e, IRTemp_INVALID, Ijk_Boring );
+ else
+ putIRegT( rN, e, IRTemp_INVALID );
+ }
+
+ // Make up a list of the registers to transfer, and their offsets
+ // in memory relative to the anchor. If the base reg (Rn) is part
+ // of the transfer, then do it last for a load and first for a store.
+ UInt xReg[16], xOff[16];
+ Int nX = 0;
+ m = 0;
+ for (i = 0; i < 16; i++) {
+ r = bINC ? i : (15-i);
+ if (0 == (regList & (1<<r)))
+ continue;
+ if (bBEFORE)
+ m++;
+ /* paranoia: check we aren't transferring the writeback
+ register during a load. Should be assured by decode-point
+ check above. */
+ if (bW == 1 && bL == 1)
+ vassert(r != rN);
+
+ xOff[nX] = 4 * m;
+ xReg[nX] = r;
+ nX++;
+
+ if (!bBEFORE)
+ m++;
+ }
+ vassert(m == nRegs);
+ vassert(nX == nRegs);
+ vassert(nX <= 16);
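+   /* Worked example: for STMDB r13!, {r4,r5,r14}, bINC == 0 and
+      bBEFORE == 1, so the loop above visits r14, r5, r4 in that
+      order and assigns them offsets 4, 8, 12.  The transfer loop
+      below subtracts those from the anchor, storing r14 at
+      [r13-4], r5 at [r13-8] and r4 at [r13-12] -- lowest-numbered
+      register at the lowest address, as the architecture requires. */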
+
+ if (bW == 0 && (regList & (1<<rN)) != 0) {
+ /* Non-writeback, and basereg is to be transferred. Do its
+ transfer last for a load and first for a store. Requires
+ reordering xOff/xReg. */
+ if (0) {
+ vex_printf("\nREG_LIST_PRE: (rN=%d)\n", rN);
+ for (i = 0; i < nX; i++)
+ vex_printf("reg %d off %d\n", xReg[i], xOff[i]);
+ vex_printf("\n");
+ }
+
+ vassert(nX > 0);
+ for (i = 0; i < nX; i++) {
+ if (xReg[i] == rN)
+ break;
+ }
+ vassert(i < nX); /* else we didn't find it! */
+ UInt tReg = xReg[i];
+ UInt tOff = xOff[i];
+ if (bL == 1) {
+ /* load; make this transfer happen last */
+ if (i < nX-1) {
+ for (m = i+1; m < nX; m++) {
+ xReg[m-1] = xReg[m];
+ xOff[m-1] = xOff[m];
+ }
+ vassert(m == nX);
+ xReg[m-1] = tReg;
+ xOff[m-1] = tOff;
+ }
+ } else {
+ /* store; make this transfer happen first */
+ if (i > 0) {
+ for (m = i-1; m >= 0; m--) {
+ xReg[m+1] = xReg[m];
+ xOff[m+1] = xOff[m];
+ }
+ vassert(m == -1);
+ xReg[0] = tReg;
+ xOff[0] = tOff;
+ }
+ }
+
+ if (0) {
+ vex_printf("REG_LIST_POST:\n");
+ for (i = 0; i < nX; i++)
+ vex_printf("reg %d off %d\n", xReg[i], xOff[i]);
+ vex_printf("\n");
+ }
+ }
+
+ /* Actually generate the transfers */
+ for (i = 0; i < nX; i++) {
+ r = xReg[i];
+ if (bL == 1) {
+ IRExpr* e = loadLE(Ity_I32,
+ binop(opADDorSUB, mkexpr(anchorT),
+ mkU32(xOff[i])));
+ if (arm) {
+ putIRegA( r, e, IRTemp_INVALID, Ijk_Ret );
+ } else {
+ // no: putIRegT( r, e, IRTemp_INVALID );
+ // putIRegT refuses to write to R15. But that might happen.
+ // Since this is uncond, and we need to be able to
+ // write the PC, just use the low level put:
+ llPutIReg( r, e );
+ }
+ } else {
+ /* if we're storing Rn, make sure we use the correct
+ value, as per extensive comments above */
+ storeLE( binop(opADDorSUB, mkexpr(anchorT), mkU32(xOff[i])),
+ r == rN ? mkexpr(oldRnT)
+ : (arm ? getIRegA(r) : getIRegT(r) ) );
+ }
+ }
+
+ // If we are doing xxMIA or xxMIB,
+ // do the transfer first, and then update rN afterwards.
+ if (bW == 1 && bINC) {
+ IRExpr* e = binop(opADDorSUB, mkexpr(oldRnT), mkU32(4*nRegs));
+ if (arm)
+ putIRegA( rN, e, IRTemp_INVALID, Ijk_Boring );
+ else
+ putIRegT( rN, e, IRTemp_INVALID );
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- VFP (CP 10 and 11) instructions ---*/
+/*------------------------------------------------------------*/
+
+/* Both ARM and Thumb */
+
+/* Translate a CP10 or CP11 instruction.  If successful, returns
+   True, and *dres may or may not be updated.  On failure, returns
+   False, and neither changes *dres nor creates any IR.
+
+ The ARM and Thumb encodings are identical for the low 28 bits of
+   the insn (yay!) and that's what the caller must supply, iow, insn28
+ has the top 4 bits masked out. Caller is responsible for
+ determining whether the masked-out bits are valid for a CP10/11
+ insn. The rules for the top 4 bits are:
+
+ ARM: 0000 to 1110 allowed, and this is the gating condition.
+ 1111 (NV) is not allowed.
+
+ Thumb: must be 1110. The gating condition is taken from
+ ITSTATE in the normal way.
+
+ Conditionalisation:
+
+ Caller must supply an IRTemp 'condT' holding the gating condition,
+ or IRTemp_INVALID indicating the insn is always executed.
+
+ Caller must also supply an ARMCondcode 'cond'. This is only used
+ for debug printing, no other purpose. For ARM, this is simply the
+ top 4 bits of the original instruction. For Thumb, the condition
+ is not (really) known until run time, and so ARMCondAL should be
+ passed, only so that printing of these instructions does not show
+ any condition.
+
+ Finally, the caller must indicate whether this occurs in ARM or
+ Thumb code.
+*/
+static Bool decode_CP10_CP11_instruction (
+ /*MOD*/DisResult* dres,
+ UInt insn28,
+ IRTemp condT,
+ ARMCondcode conq,
+ Bool isT
+ )
+{
+# define INSN(_bMax,_bMin) SLICE_UInt(insn28, (_bMax), (_bMin))
+
+ vassert(INSN(31,28) == BITS4(0,0,0,0)); // caller's obligation
+
+ if (isT) {
+ vassert(conq == ARMCondAL);
+ } else {
+ vassert(conq >= ARMCondEQ && conq <= ARMCondAL);
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- VFP instructions -- double precision (mostly) -- */
+ /* ----------------------------------------------------------- */
+
+ /* --------------------- fldmx, fstmx --------------------- */
+ /*
+ 31 27 23 19 15 11 7 0
+ P U WL
+ C4-100, C5-26 1 FSTMX cond 1100 1000 Rn Dd 1011 offset
+ C4-100, C5-28 2 FSTMIAX cond 1100 1010 Rn Dd 1011 offset
+ C4-100, C5-30 3 FSTMDBX cond 1101 0010 Rn Dd 1011 offset
+
+ C4-42, C5-26 1 FLDMX cond 1100 1001 Rn Dd 1011 offset
+ C4-42, C5-28 2 FLDMIAX cond 1100 1011 Rn Dd 1011 offset
+ C4-42, C5-30 3 FLDMDBX cond 1101 0011 Rn Dd 1011 offset
+
+ Regs transferred: Dd .. D(d + (offset-3)/2)
+ offset must be odd, must not imply a reg > 15
+ IA/DB: Rn is changed by (4 + 8 x # regs transferred)
+
+ case coding:
+ 1 at-Rn (access at Rn)
+ 2 ia-Rn (access at Rn, then Rn += 4+8n)
+ 3 db-Rn (Rn -= 4+8n, then access at Rn)
+ */
+ if (BITS8(1,1,0,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,0,0,0,0,0))
+ && INSN(11,8) == BITS4(1,0,1,1)) {
+ UInt bP = (insn28 >> 24) & 1;
+ UInt bU = (insn28 >> 23) & 1;
+ UInt bW = (insn28 >> 21) & 1;
+ UInt bL = (insn28 >> 20) & 1;
+ UInt offset = (insn28 >> 0) & 0xFF;
+ UInt rN = INSN(19,16);
+ UInt dD = (INSN(22,22) << 4) | INSN(15,12);
+ UInt nRegs = (offset - 1) / 2;
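+      /* For the X forms, offset == 2*nRegs + 1: each D register
+         occupies two words, plus one extra word of (apparently
+         implementation-defined) data, which is why the writeback
+         cases move Rn by 4 + 8*nRegs, per the table above. */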
+ UInt summary = 0;
+ Int i;
+
+ /**/ if (bP == 0 && bU == 1 && bW == 0) {
+ summary = 1;
+ }
+ else if (bP == 0 && bU == 1 && bW == 1) {
+ summary = 2;
+ }
+ else if (bP == 1 && bU == 0 && bW == 1) {
+ summary = 3;
+ }
+ else goto after_vfp_fldmx_fstmx;
+
+ /* no writebacks to r15 allowed. No use of r15 in thumb mode. */
+ if (rN == 15 && (summary == 2 || summary == 3 || isT))
+ goto after_vfp_fldmx_fstmx;
+
+ /* offset must be odd, and specify at least one register */
+ if (0 == (offset & 1) || offset < 3)
+ goto after_vfp_fldmx_fstmx;
+
+      /* can't transfer regs after D31 */
+ if (dD + nRegs - 1 >= 32)
+ goto after_vfp_fldmx_fstmx;
+
+ /* Now, we can't do a conditional load or store, since that very
+ likely will generate an exception. So we have to take a side
+ exit at this point if the condition is false. */
+ if (condT != IRTemp_INVALID) {
+ if (isT)
+ mk_skip_over_T32_if_cond_is_false( condT );
+ else
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+ /* Ok, now we're unconditional. Do the load or store. */
+
+ /* get the old Rn value */
+ IRTemp rnT = newTemp(Ity_I32);
+ assign(rnT, align4if(isT ? getIRegT(rN) : getIRegA(rN),
+ rN == 15));
+
+ /* make a new value for Rn, post-insn */
+ IRTemp rnTnew = IRTemp_INVALID;
+ if (summary == 2 || summary == 3) {
+ rnTnew = newTemp(Ity_I32);
+ assign(rnTnew, binop(summary == 2 ? Iop_Add32 : Iop_Sub32,
+ mkexpr(rnT),
+ mkU32(4 + 8 * nRegs)));
+ }
+
+ /* decide on the base transfer address */
+ IRTemp taT = newTemp(Ity_I32);
+ assign(taT, summary == 3 ? mkexpr(rnTnew) : mkexpr(rnT));
+
+ /* update Rn if necessary -- in case 3, we're moving it down, so
+ update before any memory reference, in order to keep Memcheck
+ and V's stack-extending logic (on linux) happy */
+ if (summary == 3) {
+ if (isT)
+ putIRegT(rN, mkexpr(rnTnew), IRTemp_INVALID);
+ else
+ putIRegA(rN, mkexpr(rnTnew), IRTemp_INVALID, Ijk_Boring);
+ }
+
+ /* generate the transfers */
+ for (i = 0; i < nRegs; i++) {
+ IRExpr* addr = binop(Iop_Add32, mkexpr(taT), mkU32(8*i));
+ if (bL) {
+ putDReg(dD + i, loadLE(Ity_F64, addr), IRTemp_INVALID);
+ } else {
+ storeLE(addr, getDReg(dD + i));
+ }
+ }
+
+ /* update Rn if necessary -- in case 2, we're moving it up, so
+ update after any memory reference, in order to keep Memcheck
+ and V's stack-extending logic (on linux) happy */
+ if (summary == 2) {
+ if (isT)
+ putIRegT(rN, mkexpr(rnTnew), IRTemp_INVALID);
+ else
+ putIRegA(rN, mkexpr(rnTnew), IRTemp_INVALID, Ijk_Boring);
+ }
+
+ HChar* nm = bL==1 ? "ld" : "st";
+ switch (summary) {
+ case 1: DIP("f%smx%s r%u, {d%u-d%u}\n",
+ nm, nCC(conq), rN, dD, dD + nRegs - 1);
+ break;
+ case 2: DIP("f%smiax%s r%u!, {d%u-d%u}\n",
+ nm, nCC(conq), rN, dD, dD + nRegs - 1);
+ break;
+ case 3: DIP("f%smdbx%s r%u!, {d%u-d%u}\n",
+ nm, nCC(conq), rN, dD, dD + nRegs - 1);
+ break;
+ default: vassert(0);
+ }
+
+ goto decode_success_vfp;
+ /* FIXME alignment constraints? */
+ }
+
+ after_vfp_fldmx_fstmx:
+
+ /* --------------------- fldmd, fstmd --------------------- */
+ /*
+ 31 27 23 19 15 11 7 0
+ P U WL
+ C4-96, C5-26 1 FSTMD cond 1100 1000 Rn Dd 1011 offset
+      C4-96, C5-28 2 FSTMIAD cond 1100 1010 Rn Dd 1011 offset
+      C4-96, C5-30 3 FSTMDBD cond 1101 0010 Rn Dd 1011 offset
+
+ C4-38, C5-26 1 FLDMD cond 1100 1001 Rn Dd 1011 offset
+ C4-38, C5-28 2 FLDMIAD cond 1100 1011 Rn Dd 1011 offset
+ C4-38, C5-30 3 FLDMDBD cond 1101 0011 Rn Dd 1011 offset
+
+ Regs transferred: Dd .. D(d + (offset-2)/2)
+ offset must be even, must not imply a reg > 15
+ IA/DB: Rn is changed by (8 x # regs transferred)
+
+ case coding:
+ 1 at-Rn (access at Rn)
+ 2 ia-Rn (access at Rn, then Rn += 8n)
+ 3 db-Rn (Rn -= 8n, then access at Rn)
+ */
+ if (BITS8(1,1,0,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,0,0,0,0,0))
+ && INSN(11,8) == BITS4(1,0,1,1)) {
+ UInt bP = (insn28 >> 24) & 1;
+ UInt bU = (insn28 >> 23) & 1;
+ UInt bW = (insn28 >> 21) & 1;
+ UInt bL = (insn28 >> 20) & 1;
+ UInt offset = (insn28 >> 0) & 0xFF;
+ UInt rN = INSN(19,16);
+ UInt dD = (INSN(22,22) << 4) | INSN(15,12);
+ UInt nRegs = offset / 2;
+ UInt summary = 0;
+ Int i;
+
+ /**/ if (bP == 0 && bU == 1 && bW == 0) {
+ summary = 1;
+ }
+ else if (bP == 0 && bU == 1 && bW == 1) {
+ summary = 2;
+ }
+ else if (bP == 1 && bU == 0 && bW == 1) {
+ summary = 3;
+ }
+ else goto after_vfp_fldmd_fstmd;
+
+ /* no writebacks to r15 allowed. No use of r15 in thumb mode. */
+ if (rN == 15 && (summary == 2 || summary == 3 || isT))
+ goto after_vfp_fldmd_fstmd;
+
+ /* offset must be even, and specify at least one register */
+ if (1 == (offset & 1) || offset < 2)
+ goto after_vfp_fldmd_fstmd;
+
+      /* can't transfer regs after D31 */
+ if (dD + nRegs - 1 >= 32)
+ goto after_vfp_fldmd_fstmd;
+
+ /* Now, we can't do a conditional load or store, since that very
+ likely will generate an exception. So we have to take a side
+ exit at this point if the condition is false. */
+ if (condT != IRTemp_INVALID) {
+ if (isT)
+ mk_skip_over_T32_if_cond_is_false( condT );
+ else
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+ /* Ok, now we're unconditional. Do the load or store. */
+
+ /* get the old Rn value */
+ IRTemp rnT = newTemp(Ity_I32);
+ assign(rnT, align4if(isT ? getIRegT(rN) : getIRegA(rN),
+ rN == 15));
+
+ /* make a new value for Rn, post-insn */
+ IRTemp rnTnew = IRTemp_INVALID;
+ if (summary == 2 || summary == 3) {
+ rnTnew = newTemp(Ity_I32);
+ assign(rnTnew, binop(summary == 2 ? Iop_Add32 : Iop_Sub32,
+ mkexpr(rnT),
+ mkU32(8 * nRegs)));
+ }
+
+ /* decide on the base transfer address */
+ IRTemp taT = newTemp(Ity_I32);
+ assign(taT, summary == 3 ? mkexpr(rnTnew) : mkexpr(rnT));
+
+ /* update Rn if necessary -- in case 3, we're moving it down, so
+ update before any memory reference, in order to keep Memcheck
+ and V's stack-extending logic (on linux) happy */
+ if (summary == 3) {
+ if (isT)
+ putIRegT(rN, mkexpr(rnTnew), IRTemp_INVALID);
+ else
+ putIRegA(rN, mkexpr(rnTnew), IRTemp_INVALID, Ijk_Boring);
+ }
+
+ /* generate the transfers */
+ for (i = 0; i < nRegs; i++) {
+ IRExpr* addr = binop(Iop_Add32, mkexpr(taT), mkU32(8*i));
+ if (bL) {
+ putDReg(dD + i, loadLE(Ity_F64, addr), IRTemp_INVALID);
+ } else {
+ storeLE(addr, getDReg(dD + i));
+ }
+ }
+
+ /* update Rn if necessary -- in case 2, we're moving it up, so
+ update after any memory reference, in order to keep Memcheck
+ and V's stack-extending logic (on linux) happy */
+ if (summary == 2) {
+ if (isT)
+ putIRegT(rN, mkexpr(rnTnew), IRTemp_INVALID);
+ else
+ putIRegA(rN, mkexpr(rnTnew), IRTemp_INVALID, Ijk_Boring);
+ }
+
+ HChar* nm = bL==1 ? "ld" : "st";
+ switch (summary) {
+ case 1: DIP("f%smd%s r%u, {d%u-d%u}\n",
+ nm, nCC(conq), rN, dD, dD + nRegs - 1);
+ break;
+ case 2: DIP("f%smiad%s r%u!, {d%u-d%u}\n",
+ nm, nCC(conq), rN, dD, dD + nRegs - 1);
+ break;
+ case 3: DIP("f%smdbd%s r%u!, {d%u-d%u}\n",
+ nm, nCC(conq), rN, dD, dD + nRegs - 1);
+ break;
+ default: vassert(0);
+ }
+
+ goto decode_success_vfp;
+ /* FIXME alignment constraints? */
+ }
+
+ after_vfp_fldmd_fstmd:
+
+ /* ------------------- fmrx, fmxr ------------------- */
+ if (BITS8(1,1,1,0,1,1,1,1) == INSN(27,20)
+ && BITS4(1,0,1,0) == INSN(11,8)
+ && BITS8(0,0,0,1,0,0,0,0) == (insn28 & 0xFF)) {
+ UInt rD = INSN(15,12);
+ UInt reg = INSN(19,16);
+ if (reg == BITS4(0,0,0,1)) {
+ if (rD == 15) {
+ IRTemp nzcvT = newTemp(Ity_I32);
+ /* When rD is 15, we are copying the top 4 bits of FPSCR
+ into CPSR. That is, set the flags thunk to COPY and
+ install FPSCR[31:28] as the value to copy. */
+ assign(nzcvT, binop(Iop_And32,
+ IRExpr_Get(OFFB_FPSCR, Ity_I32),
+ mkU32(0xF0000000)));
+ setFlags_D1(ARMG_CC_OP_COPY, nzcvT, condT);
+ DIP("fmstat%s\n", nCC(conq));
+ } else {
+ /* Otherwise, merely transfer FPSCR to r0 .. r14. */
+ IRExpr* e = IRExpr_Get(OFFB_FPSCR, Ity_I32);
+ if (isT)
+ putIRegT(rD, e, condT);
+ else
+ putIRegA(rD, e, condT, Ijk_Boring);
+ DIP("fmrx%s r%u, fpscr\n", nCC(conq), rD);
+ }
+ goto decode_success_vfp;
+ }
+ /* fall through */
+ }
+
+ if (BITS8(1,1,1,0,1,1,1,0) == INSN(27,20)
+ && BITS4(1,0,1,0) == INSN(11,8)
+ && BITS8(0,0,0,1,0,0,0,0) == (insn28 & 0xFF)) {
+ UInt rD = INSN(15,12);
+ UInt reg = INSN(19,16);
+ if (reg == BITS4(0,0,0,1)) {
+ putMiscReg32(OFFB_FPSCR,
+ isT ? getIRegT(rD) : getIRegA(rD), condT);
+ DIP("fmxr%s fpscr, r%u\n", nCC(conq), rD);
+ goto decode_success_vfp;
+ }
+ /* fall through */
+ }
+
+ /* --------------------- vmov --------------------- */
+ // VMOV dM, rD, rN
+ if (0x0C400B10 == (insn28 & 0x0FF00FD0)) {
+ UInt dM = INSN(3,0) | (INSN(5,5) << 4);
+ UInt rD = INSN(15,12); /* lo32 */
+ UInt rN = INSN(19,16); /* hi32 */
+ if (rD == 15 || rN == 15 || (isT && (rD == 13 || rN == 13))) {
+ /* fall through */
+ } else {
+ putDReg(dM,
+ unop(Iop_ReinterpI64asF64,
+ binop(Iop_32HLto64,
+ isT ? getIRegT(rN) : getIRegA(rN),
+ isT ? getIRegT(rD) : getIRegA(rD))),
+ condT);
+ DIP("vmov%s d%u, r%u, r%u\n", nCC(conq), dM, rD, rN);
+ goto decode_success_vfp;
+ }
+ /* fall through */
+ }
+
+ // VMOV rD, rN, dM
+ if (0x0C500B10 == (insn28 & 0x0FF00FD0)) {
+ UInt dM = INSN(3,0) | (INSN(5,5) << 4);
+ UInt rD = INSN(15,12); /* lo32 */
+ UInt rN = INSN(19,16); /* hi32 */
+ if (rD == 15 || rN == 15 || (isT && (rD == 13 || rN == 13))
+ || rD == rN) {
+ /* fall through */
+ } else {
+ IRTemp i64 = newTemp(Ity_I64);
+ assign(i64, unop(Iop_ReinterpF64asI64, getDReg(dM)));
+ IRExpr* hi32 = unop(Iop_64HIto32, mkexpr(i64));
+ IRExpr* lo32 = unop(Iop_64to32, mkexpr(i64));
+ if (isT) {
+ putIRegT(rN, hi32, condT);
+ putIRegT(rD, lo32, condT);
+ } else {
+ putIRegA(rN, hi32, condT, Ijk_Boring);
+ putIRegA(rD, lo32, condT, Ijk_Boring);
+ }
+ DIP("vmov%s r%u, r%u, d%u\n", nCC(conq), rD, rN, dM);
+ goto decode_success_vfp;
+ }
+ /* fall through */
+ }
+
+ // VMOV sD, sD+1, rN, rM
+ if (0x0C400A10 == (insn28 & 0x0FF00FD0)) {
+ UInt sD = (INSN(3,0) << 1) | INSN(5,5);
+ UInt rN = INSN(15,12);
+ UInt rM = INSN(19,16);
+ if (rM == 15 || rN == 15 || (isT && (rM == 13 || rN == 13))
+ || sD == 31) {
+ /* fall through */
+ } else {
+ putFReg(sD,
+ unop(Iop_ReinterpI32asF32, isT ? getIRegT(rN) : getIRegA(rN)),
+ condT);
+ putFReg(sD+1,
+ unop(Iop_ReinterpI32asF32, isT ? getIRegT(rM) : getIRegA(rM)),
+ condT);
+ DIP("vmov%s, s%u, s%u, r%u, r%u\n",
+ nCC(conq), sD, sD + 1, rN, rM);
+ goto decode_success_vfp;
+ }
+ }
+
+ // VMOV rN, rM, sD, sD+1
+ if (0x0C500A10 == (insn28 & 0x0FF00FD0)) {
+ UInt sD = (INSN(3,0) << 1) | INSN(5,5);
+ UInt rN = INSN(15,12);
+ UInt rM = INSN(19,16);
+ if (rM == 15 || rN == 15 || (isT && (rM == 13 || rN == 13))
+ || sD == 31 || rN == rM) {
+ /* fall through */
+ } else {
+ IRExpr* res0 = unop(Iop_ReinterpF32asI32, getFReg(sD));
+ IRExpr* res1 = unop(Iop_ReinterpF32asI32, getFReg(sD+1));
+ if (isT) {
+ putIRegT(rN, res0, condT);
+ putIRegT(rM, res1, condT);
+ } else {
+ putIRegA(rN, res0, condT, Ijk_Boring);
+ putIRegA(rM, res1, condT, Ijk_Boring);
+ }
+ DIP("vmov%s, r%u, r%u, s%u, s%u\n",
+ nCC(conq), rN, rM, sD, sD + 1);
+ goto decode_success_vfp;
+ }
+ }
+
+ // VMOV rD[x], rT (ARM core register to scalar)
+ if (0x0E000B10 == (insn28 & 0x0F900F1F)) {
+ UInt rD = (INSN(7,7) << 4) | INSN(19,16);
+ UInt rT = INSN(15,12);
+ UInt opc = (INSN(22,21) << 2) | INSN(6,5);
+ UInt index;
+ if (rT == 15 || (isT && rT == 13)) {
+ /* fall through */
+ } else {
+ if ((opc & BITS4(1,0,0,0)) == BITS4(1,0,0,0)) {
+ index = opc & 7;
+ putDRegI64(rD, triop(Iop_SetElem8x8,
+ getDRegI64(rD),
+ mkU8(index),
+ unop(Iop_32to8,
+ isT ? getIRegT(rT) : getIRegA(rT))),
+ condT);
+ DIP("vmov%s.8 d%u[%u], r%u\n", nCC(conq), rD, index, rT);
+ goto decode_success_vfp;
+ }
+ else if ((opc & BITS4(1,0,0,1)) == BITS4(0,0,0,1)) {
+ index = (opc >> 1) & 3;
+ putDRegI64(rD, triop(Iop_SetElem16x4,
+ getDRegI64(rD),
+ mkU8(index),
+ unop(Iop_32to16,
+ isT ? getIRegT(rT) : getIRegA(rT))),
+ condT);
+ DIP("vmov%s.16 d%u[%u], r%u\n", nCC(conq), rD, index, rT);
+ goto decode_success_vfp;
+ }
+ else if ((opc & BITS4(1,0,1,1)) == BITS4(0,0,0,0)) {
+ index = (opc >> 2) & 1;
+ putDRegI64(rD, triop(Iop_SetElem32x2,
+ getDRegI64(rD),
+ mkU8(index),
+ isT ? getIRegT(rT) : getIRegA(rT)),
+ condT);
+ DIP("vmov%s.32 d%u[%u], r%u\n", nCC(conq), rD, index, rT);
+ goto decode_success_vfp;
+ } else {
+ /* fall through */
+ }
+ }
+ }
+
+ // VMOV (scalar to ARM core register)
+ // VMOV rT, rD[x]
+ if (0x0E100B10 == (insn28 & 0x0F100F1F)) {
+ UInt rN = (INSN(7,7) << 4) | INSN(19,16);
+ UInt rT = INSN(15,12);
+ UInt U = INSN(23,23);
+ UInt opc = (INSN(22,21) << 2) | INSN(6,5);
+ UInt index;
+ if (rT == 15 || (isT && rT == 13)) {
+ /* fall through */
+ } else {
+ if ((opc & BITS4(1,0,0,0)) == BITS4(1,0,0,0)) {
+ index = opc & 7;
+ IRExpr* e = unop(U ? Iop_8Uto32 : Iop_8Sto32,
+ binop(Iop_GetElem8x8,
+ getDRegI64(rN),
+ mkU8(index)));
+ if (isT)
+ putIRegT(rT, e, condT);
+ else
+ putIRegA(rT, e, condT, Ijk_Boring);
+ DIP("vmov%s.%c8 r%u, d%u[%u]\n", nCC(conq), U ? 'u' : 's',
+ rT, rN, index);
+ goto decode_success_vfp;
+ }
+ else if ((opc & BITS4(1,0,0,1)) == BITS4(0,0,0,1)) {
+ index = (opc >> 1) & 3;
+ IRExpr* e = unop(U ? Iop_16Uto32 : Iop_16Sto32,
+ binop(Iop_GetElem16x4,
+ getDRegI64(rN),
+ mkU8(index)));
+ if (isT)
+ putIRegT(rT, e, condT);
+ else
+ putIRegA(rT, e, condT, Ijk_Boring);
+ DIP("vmov%s.%c16 r%u, d%u[%u]\n", nCC(conq), U ? 'u' : 's',
+ rT, rN, index);
+ goto decode_success_vfp;
+ }
+ else if ((opc & BITS4(1,0,1,1)) == BITS4(0,0,0,0) && U == 0) {
+ index = (opc >> 2) & 1;
+ IRExpr* e = binop(Iop_GetElem32x2, getDRegI64(rN), mkU8(index));
+ if (isT)
+ putIRegT(rT, e, condT);
+ else
+ putIRegA(rT, e, condT, Ijk_Boring);
+ DIP("vmov%s.32 r%u, d%u[%u]\n", nCC(conq), rT, rN, index);
+ goto decode_success_vfp;
+ } else {
+ /* fall through */
+ }
+ }
+ }
+
+ // VMOV.F32 sD, #imm
+ // FCONSTS sD, #imm
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,0,0,0) == INSN(7,4) && INSN(11,8) == BITS4(1,0,1,0)) {
+ UInt rD = (INSN(15,12) << 1) | INSN(22,22);
+ UInt imm8 = (INSN(19,16) << 4) | INSN(3,0);
+ UInt b = (imm8 >> 6) & 1;
+ UInt imm;
+ imm = (BITS8((imm8 >> 7) & 1,(~b) & 1,b,b,b,b,b,(imm8 >> 5) & 1) << 8)
+ | ((imm8 & 0x1f) << 3);
+ imm <<= 16;
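+      /* imm now holds sign : NOT(b) : b b b b b : imm8[5:0],
+         followed by 19 zeroes, with b = imm8[6].  E.g. imm8 = 0x70
+         expands to 0x3F800000, ie 1.0f. */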
+ putFReg(rD, unop(Iop_ReinterpI32asF32, mkU32(imm)), condT);
+ DIP("fconsts%s s%u #%u", nCC(conq), rD, imm8);
+ goto decode_success_vfp;
+ }
+
+ // VMOV.F64 dD, #imm
+ // FCONSTD dD, #imm
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,0,0,0) == INSN(7,4) && INSN(11,8) == BITS4(1,0,1,1)) {
+ UInt rD = INSN(15,12) | (INSN(22,22) << 4);
+ UInt imm8 = (INSN(19,16) << 4) | INSN(3,0);
+ UInt b = (imm8 >> 6) & 1;
+ ULong imm;
+ imm = (BITS8((imm8 >> 7) & 1,(~b) & 1,b,b,b,b,b,b) << 8)
+ | BITS8(b,b,0,0,0,0,0,0) | (imm8 & 0x3f);
+ imm <<= 48;
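+      /* Same expansion at double width: sign : NOT(b) : b x 8 :
+         imm8[5:0], followed by 48 zeroes.  E.g. imm8 = 0x70
+         expands to 0x3FF0000000000000, ie 1.0. */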
+ putDReg(rD, unop(Iop_ReinterpI64asF64, mkU64(imm)), condT);
+ DIP("fconstd%s d%u #%u", nCC(conq), rD, imm8);
+ goto decode_success_vfp;
+ }
+
+ /* ---------------------- vdup ------------------------- */
+ // VDUP dD, rT
+ // VDUP qD, rT
+ if (BITS8(1,1,1,0,1,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,0,1))
+ && BITS4(1,0,1,1) == INSN(11,8) && INSN(6,6) == 0 && INSN(4,4) == 1) {
+ UInt rD = (INSN(7,7) << 4) | INSN(19,16);
+ UInt rT = INSN(15,12);
+ UInt Q = INSN(21,21);
+ UInt size = (INSN(22,22) << 1) | INSN(5,5);
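+      /* size (b22:b5) selects the lane width: 0 -> 32 bit,
+         1 -> 16 bit, 2 -> 8 bit; 3 is undefined.  Q (bit 21)
+         selects a 128-bit destination, which must be an
+         even-numbered pair of D registers. */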
+      if (rT == 15 || (isT && rT == 13) || size == 3 || (Q && (rD & 1))) {
+ /* fall through */
+ } else {
+ IRExpr* e = isT ? getIRegT(rT) : getIRegA(rT);
+ if (Q) {
+ rD >>= 1;
+ switch (size) {
+ case 0:
+ putQReg(rD, unop(Iop_Dup32x4, e), condT);
+ break;
+ case 1:
+ putQReg(rD, unop(Iop_Dup16x8, unop(Iop_32to16, e)),
+ condT);
+ break;
+ case 2:
+ putQReg(rD, unop(Iop_Dup8x16, unop(Iop_32to8, e)),
+ condT);
+ break;
+ default:
+ vassert(0);
+ }
+ DIP("vdup.%u q%u, r%u\n", 32 / (1<<size), rD, rT);
+ } else {
+ switch (size) {
+ case 0:
+ putDRegI64(rD, unop(Iop_Dup32x2, e), condT);
+ break;
+ case 1:
+ putDRegI64(rD, unop(Iop_Dup16x4, unop(Iop_32to16, e)),
+ condT);
+ break;
+ case 2:
+ putDRegI64(rD, unop(Iop_Dup8x8, unop(Iop_32to8, e)),
+ condT);
+ break;
+ default:
+ vassert(0);
+ }
+ DIP("vdup.%u d%u, r%u\n", 32 / (1<<size), rD, rT);
+ }
+ goto decode_success_vfp;
+ }
+ }
+
+ /* --------------------- f{ld,st}d --------------------- */
+ // FLDD, FSTD
+ if (BITS8(1,1,0,1,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,0,1,0))
+ && BITS4(1,0,1,1) == INSN(11,8)) {
+ UInt dD = INSN(15,12) | (INSN(22,22) << 4);
+ UInt rN = INSN(19,16);
+ UInt offset = (insn28 & 0xFF) << 2;
+ UInt bU = (insn28 >> 23) & 1; /* 1: +offset 0: -offset */
+ UInt bL = (insn28 >> 20) & 1; /* 1: load 0: store */
+ /* make unconditional */
+ if (condT != IRTemp_INVALID) {
+ if (isT)
+ mk_skip_over_T32_if_cond_is_false( condT );
+ else
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+ IRTemp ea = newTemp(Ity_I32);
+ assign(ea, binop(bU ? Iop_Add32 : Iop_Sub32,
+ align4if(isT ? getIRegT(rN) : getIRegA(rN),
+ rN == 15),
+ mkU32(offset)));
+ if (bL) {
+ putDReg(dD, loadLE(Ity_F64,mkexpr(ea)), IRTemp_INVALID);
+ } else {
+ storeLE(mkexpr(ea), getDReg(dD));
+ }
+ DIP("f%sd%s d%u, [r%u, %c#%u]\n",
+ bL ? "ld" : "st", nCC(conq), dD, rN,
+ bU ? '+' : '-', offset);
+ goto decode_success_vfp;
+ }
+
+ /* --------------------- dp insns (D) --------------------- */
+ if (BITS8(1,1,1,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,0,0,0))
+ && BITS4(1,0,1,1) == INSN(11,8)
+ && BITS4(0,0,0,0) == (INSN(7,4) & BITS4(0,0,0,1))) {
+ UInt dM = INSN(3,0) | (INSN(5,5) << 4); /* argR */
+ UInt dD = INSN(15,12) | (INSN(22,22) << 4); /* dst/acc */
+ UInt dN = INSN(19,16) | (INSN(7,7) << 4); /* argL */
+ UInt bP = (insn28 >> 23) & 1;
+ UInt bQ = (insn28 >> 21) & 1;
+ UInt bR = (insn28 >> 20) & 1;
+ UInt bS = (insn28 >> 6) & 1;
+ UInt opc = (bP << 3) | (bQ << 2) | (bR << 1) | bS;
+ IRExpr* rm = get_FAKE_roundingmode(); /* XXXROUNDINGFIXME */
+ switch (opc) {
+ case BITS4(0,0,0,0): /* MAC: d + n * m */
+ putDReg(dD, triop(Iop_AddF64, rm,
+ getDReg(dD),
+ triop(Iop_MulF64, rm, getDReg(dN),
+ getDReg(dM))),
+ condT);
+ DIP("fmacd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
+ goto decode_success_vfp;
+ case BITS4(0,0,0,1): /* NMAC: d + -(n * m) */
+ putDReg(dD, triop(Iop_AddF64, rm,
+ getDReg(dD),
+ unop(Iop_NegF64,
+ triop(Iop_MulF64, rm, getDReg(dN),
+ getDReg(dM)))),
+ condT);
+ DIP("fnmacd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
+ goto decode_success_vfp;
+ case BITS4(0,0,1,0): /* MSC: - d + n * m */
+ putDReg(dD, triop(Iop_AddF64, rm,
+ unop(Iop_NegF64, getDReg(dD)),
+ triop(Iop_MulF64, rm, getDReg(dN),
+ getDReg(dM))),
+ condT);
+ DIP("fmscd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
+ goto decode_success_vfp;
+ case BITS4(0,0,1,1): /* NMSC: - d + -(n * m) */
+ putDReg(dD, triop(Iop_AddF64, rm,
+ unop(Iop_NegF64, getDReg(dD)),
+ unop(Iop_NegF64,
+ triop(Iop_MulF64, rm, getDReg(dN),
+ getDReg(dM)))),
+ condT);
+ DIP("fnmscd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
+ goto decode_success_vfp;
+ case BITS4(0,1,0,0): /* MUL: n * m */
+ putDReg(dD, triop(Iop_MulF64, rm, getDReg(dN), getDReg(dM)),
+ condT);
+ DIP("fmuld%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
+ goto decode_success_vfp;
+ case BITS4(0,1,0,1): /* NMUL: - n * m */
+ putDReg(dD, unop(Iop_NegF64,
+ triop(Iop_MulF64, rm, getDReg(dN),
+ getDReg(dM))),
+ condT);
+ DIP("fnmuld%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
+ goto decode_success_vfp;
+ case BITS4(0,1,1,0): /* ADD: n + m */
+ putDReg(dD, triop(Iop_AddF64, rm, getDReg(dN), getDReg(dM)),
+ condT);
+ DIP("faddd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
+ goto decode_success_vfp;
+ case BITS4(0,1,1,1): /* SUB: n - m */
+ putDReg(dD, triop(Iop_SubF64, rm, getDReg(dN), getDReg(dM)),
+ condT);
+ DIP("fsubd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
+ goto decode_success_vfp;
+ case BITS4(1,0,0,0): /* DIV: n / m */
+ putDReg(dD, triop(Iop_DivF64, rm, getDReg(dN), getDReg(dM)),
+ condT);
+ DIP("fdivd%s d%u, d%u, d%u\n", nCC(conq), dD, dN, dM);
+ goto decode_success_vfp;
+ default:
+ break;
+ }
+ }
+
+ /* --------------------- compares (D) --------------------- */
+ /* 31 27 23 19 15 11 7 3
+ 28 24 20 16 12 8 4 0
+ FCMPD cond 1110 1D11 0100 Dd 1011 0100 Dm
+ FCMPED cond 1110 1D11 0100 Dd 1011 1100 Dm
+ FCMPZD cond 1110 1D11 0101 Dd 1011 0100 0000
+ FCMPZED cond 1110 1D11 0101 Dd 1011 1100 0000
+ Z N
+
+ Z=0 Compare Dd vs Dm and set FPSCR 31:28 accordingly
+ Z=1 Compare Dd vs zero
+
+ N=1 generates Invalid Operation exn if either arg is any kind of NaN
+ N=0 generates Invalid Operation exn if either arg is a signalling NaN
+ (Not that we pay any attention to N here)
+ */
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,1,0,0) == (INSN(19,16) & BITS4(1,1,1,0))
+ && BITS4(1,0,1,1) == INSN(11,8)
+ && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
+ UInt bZ = (insn28 >> 16) & 1;
+ UInt bN = (insn28 >> 7) & 1;
+ UInt dD = INSN(15,12) | (INSN(22,22) << 4);
+ UInt dM = INSN(3,0) | (INSN(5,5) << 4);
+ if (bZ && INSN(3,0) != 0) {
+ /* does not decode; fall through */
+ } else {
+ IRTemp argL = newTemp(Ity_F64);
+ IRTemp argR = newTemp(Ity_F64);
+ IRTemp irRes = newTemp(Ity_I32);
+ assign(argL, getDReg(dD));
+ assign(argR, bZ ? IRExpr_Const(IRConst_F64i(0)) : getDReg(dM));
+ assign(irRes, binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)));
+
+ IRTemp nzcv = IRTemp_INVALID;
+ IRTemp oldFPSCR = newTemp(Ity_I32);
+ IRTemp newFPSCR = newTemp(Ity_I32);
+
+ /* This is where the fun starts. We have to convert 'irRes'
+ from an IR-convention return result (IRCmpF64Result) to an
+ ARM-encoded (N,Z,C,V) group. The final result is in the
+ bottom 4 bits of 'nzcv'. */
+ /* Map compare result from IR to ARM(nzcv) */
+ /*
+ FP cmp result | IR | ARM(nzcv)
+ --------------------------------
+ UN 0x45 0011
+ LT 0x01 1000
+ GT 0x00 0010
+ EQ 0x40 0110
+ */
+ nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes);
+
+ /* And update FPSCR accordingly */
+ assign(oldFPSCR, IRExpr_Get(OFFB_FPSCR, Ity_I32));
+ assign(newFPSCR,
+ binop(Iop_Or32,
+ binop(Iop_And32, mkexpr(oldFPSCR), mkU32(0x0FFFFFFF)),
+ binop(Iop_Shl32, mkexpr(nzcv), mkU8(28))));
+
+ putMiscReg32(OFFB_FPSCR, mkexpr(newFPSCR), condT);
+
+ if (bZ) {
+ DIP("fcmpz%sd%s d%u\n", bN ? "e" : "", nCC(conq), dD);
+ } else {
+ DIP("fcmp%sd%s d%u, d%u\n", bN ? "e" : "", nCC(conq), dD, dM);
+ }
+ goto decode_success_vfp;
+ }
+ /* fall through */
+ }
+
+ /* --------------------- unary (D) --------------------- */
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,0,0,0) == (INSN(19,16) & BITS4(1,1,1,0))
+ && BITS4(1,0,1,1) == INSN(11,8)
+ && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
+ UInt dD = INSN(15,12) | (INSN(22,22) << 4);
+ UInt dM = INSN(3,0) | (INSN(5,5) << 4);
+ UInt b16 = (insn28 >> 16) & 1;
+ UInt b7 = (insn28 >> 7) & 1;
+ /**/ if (b16 == 0 && b7 == 0) {
+ // FCPYD
+ putDReg(dD, getDReg(dM), condT);
+ DIP("fcpyd%s d%u, d%u\n", nCC(conq), dD, dM);
+ goto decode_success_vfp;
+ }
+ else if (b16 == 0 && b7 == 1) {
+ // FABSD
+ putDReg(dD, unop(Iop_AbsF64, getDReg(dM)), condT);
+ DIP("fabsd%s d%u, d%u\n", nCC(conq), dD, dM);
+ goto decode_success_vfp;
+ }
+ else if (b16 == 1 && b7 == 0) {
+ // FNEGD
+ putDReg(dD, unop(Iop_NegF64, getDReg(dM)), condT);
+ DIP("fnegd%s d%u, d%u\n", nCC(conq), dD, dM);
+ goto decode_success_vfp;
+ }
+ else if (b16 == 1 && b7 == 1) {
+ // FSQRTD
+ IRExpr* rm = get_FAKE_roundingmode(); /* XXXROUNDINGFIXME */
+ putDReg(dD, binop(Iop_SqrtF64, rm, getDReg(dM)), condT);
+ DIP("fsqrtd%s d%u, d%u\n", nCC(conq), dD, dM);
+ goto decode_success_vfp;
+ }
+ else
+ vassert(0);
+
+ /* fall through */
+ }
+
+ /* ----------------- I <-> D conversions ----------------- */
+
+ // F{S,U}ITOD dD, fM
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(1,0,0,0) == (INSN(19,16) & BITS4(1,1,1,1))
+ && BITS4(1,0,1,1) == INSN(11,8)
+ && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
+ UInt bM = (insn28 >> 5) & 1;
+ UInt fM = (INSN(3,0) << 1) | bM;
+ UInt dD = INSN(15,12) | (INSN(22,22) << 4);
+ UInt syned = (insn28 >> 7) & 1;
+ if (syned) {
+ // FSITOD
+ putDReg(dD, unop(Iop_I32StoF64,
+ unop(Iop_ReinterpF32asI32, getFReg(fM))),
+ condT);
+ DIP("fsitod%s d%u, s%u\n", nCC(conq), dD, fM);
+ } else {
+ // FUITOD
+ putDReg(dD, unop(Iop_I32UtoF64,
+ unop(Iop_ReinterpF32asI32, getFReg(fM))),
+ condT);
+ DIP("fuitod%s d%u, s%u\n", nCC(conq), dD, fM);
+ }
+ goto decode_success_vfp;
+ }
+
+ // FTO{S,U}ID fD, dM
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(1,1,0,0) == (INSN(19,16) & BITS4(1,1,1,0))
+ && BITS4(1,0,1,1) == INSN(11,8)
+ && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
+ UInt bD = (insn28 >> 22) & 1;
+ UInt fD = (INSN(15,12) << 1) | bD;
+ UInt dM = INSN(3,0) | (INSN(5,5) << 4);
+ UInt bZ = (insn28 >> 7) & 1;
+ UInt syned = (insn28 >> 16) & 1;
+ IRTemp rmode = newTemp(Ity_I32);
+ assign(rmode, bZ ? mkU32(Irrm_ZERO)
+ : mkexpr(mk_get_IR_rounding_mode()));
+ if (syned) {
+ // FTOSID
+ putFReg(fD, unop(Iop_ReinterpI32asF32,
+ binop(Iop_F64toI32S, mkexpr(rmode),
+ getDReg(dM))),
+ condT);
+ DIP("ftosi%sd%s s%u, d%u\n", bZ ? "z" : "",
+ nCC(conq), fD, dM);
+ } else {
+ // FTOUID
+ putFReg(fD, unop(Iop_ReinterpI32asF32,
+ binop(Iop_F64toI32U, mkexpr(rmode),
+ getDReg(dM))),
+ condT);
+ DIP("ftoui%sd%s s%u, d%u\n", bZ ? "z" : "",
+ nCC(conq), fD, dM);
+ }
+ goto decode_success_vfp;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- VFP instructions -- single precision -- */
+ /* ----------------------------------------------------------- */
+
+ /* --------------------- fldms, fstms --------------------- */
+ /*
+ 31 27 23 19 15 11 7 0
+ P UDWL
+      C4-98, C5-26 1 FSTMS cond 1100 1x00 Rn Fd 1010 offset
+      C4-98, C5-28 2 FSTMIAS cond 1100 1x10 Rn Fd 1010 offset
+      C4-98, C5-30 3 FSTMDBS cond 1101 0x10 Rn Fd 1010 offset
+
+      C4-40, C5-26 1 FLDMS cond 1100 1x01 Rn Fd 1010 offset
+      C4-40, C5-28 2 FLDMIAS cond 1100 1x11 Rn Fd 1010 offset
+      C4-40, C5-30 3 FLDMDBS cond 1101 0x11 Rn Fd 1010 offset
+
+      Regs transferred: F(Fd:D) .. F(Fd:D + offset - 1)
+ offset must not imply a reg > 15
+ IA/DB: Rn is changed by (4 x # regs transferred)
+
+ case coding:
+ 1 at-Rn (access at Rn)
+ 2 ia-Rn (access at Rn, then Rn += 4n)
+ 3 db-Rn (Rn -= 4n, then access at Rn)
+ */
+ if (BITS8(1,1,0,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,0,0,0,0,0))
+ && INSN(11,8) == BITS4(1,0,1,0)) {
+ UInt bP = (insn28 >> 24) & 1;
+ UInt bU = (insn28 >> 23) & 1;
+ UInt bW = (insn28 >> 21) & 1;
+ UInt bL = (insn28 >> 20) & 1;
+ UInt bD = (insn28 >> 22) & 1;
+ UInt offset = (insn28 >> 0) & 0xFF;
+ UInt rN = INSN(19,16);
+ UInt fD = (INSN(15,12) << 1) | bD;
+ UInt nRegs = offset;
+ UInt summary = 0;
+ Int i;
+
+ /**/ if (bP == 0 && bU == 1 && bW == 0) {
+ summary = 1;
+ }
+ else if (bP == 0 && bU == 1 && bW == 1) {
+ summary = 2;
+ }
+ else if (bP == 1 && bU == 0 && bW == 1) {
+ summary = 3;
+ }
+ else goto after_vfp_fldms_fstms;
+
+ /* no writebacks to r15 allowed. No use of r15 in thumb mode. */
+ if (rN == 15 && (summary == 2 || summary == 3 || isT))
+ goto after_vfp_fldms_fstms;
+
+ /* offset must specify at least one register */
+ if (offset < 1)
+ goto after_vfp_fldms_fstms;
+
+ /* can't transfer regs after S31 */
+ if (fD + nRegs - 1 >= 32)
+ goto after_vfp_fldms_fstms;
+
+ /* Now, we can't do a conditional load or store, since that very
+ likely will generate an exception. So we have to take a side
+ exit at this point if the condition is false. */
+ if (condT != IRTemp_INVALID) {
+ if (isT)
+ mk_skip_over_T32_if_cond_is_false( condT );
+ else
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+ /* Ok, now we're unconditional. Do the load or store. */
+
+ /* get the old Rn value */
+ IRTemp rnT = newTemp(Ity_I32);
+ assign(rnT, align4if(isT ? getIRegT(rN) : getIRegA(rN),
+ rN == 15));
+
+ /* make a new value for Rn, post-insn */
+ IRTemp rnTnew = IRTemp_INVALID;
+ if (summary == 2 || summary == 3) {
+ rnTnew = newTemp(Ity_I32);
+ assign(rnTnew, binop(summary == 2 ? Iop_Add32 : Iop_Sub32,
+ mkexpr(rnT),
+ mkU32(4 * nRegs)));
+ }
+
+ /* decide on the base transfer address */
+ IRTemp taT = newTemp(Ity_I32);
+ assign(taT, summary == 3 ? mkexpr(rnTnew) : mkexpr(rnT));
+
+ /* update Rn if necessary -- in case 3, we're moving it down, so
+ update before any memory reference, in order to keep Memcheck
+ and V's stack-extending logic (on linux) happy */
+ if (summary == 3) {
+ if (isT)
+ putIRegT(rN, mkexpr(rnTnew), IRTemp_INVALID);
+ else
+ putIRegA(rN, mkexpr(rnTnew), IRTemp_INVALID, Ijk_Boring);
+ }
+
+ /* generate the transfers */
+ for (i = 0; i < nRegs; i++) {
+ IRExpr* addr = binop(Iop_Add32, mkexpr(taT), mkU32(4*i));
+ if (bL) {
+ putFReg(fD + i, loadLE(Ity_F32, addr), IRTemp_INVALID);
+ } else {
+ storeLE(addr, getFReg(fD + i));
+ }
+ }
+
+ /* update Rn if necessary -- in case 2, we're moving it up, so
+ update after any memory reference, in order to keep Memcheck
+ and V's stack-extending logic (on linux) happy */
+ if (summary == 2) {
+ if (isT)
+ putIRegT(rN, mkexpr(rnTnew), IRTemp_INVALID);
+ else
+ putIRegA(rN, mkexpr(rnTnew), IRTemp_INVALID, Ijk_Boring);
+ }
+
+ HChar* nm = bL==1 ? "ld" : "st";
+ switch (summary) {
+ case 1: DIP("f%sms%s r%u, {s%u-s%u}\n",
+ nm, nCC(conq), rN, fD, fD + nRegs - 1);
+ break;
+ case 2: DIP("f%smias%s r%u!, {s%u-s%u}\n",
+ nm, nCC(conq), rN, fD, fD + nRegs - 1);
+ break;
+ case 3: DIP("f%smdbs%s r%u!, {s%u-s%u}\n",
+ nm, nCC(conq), rN, fD, fD + nRegs - 1);
+ break;
+ default: vassert(0);
+ }
+
+ goto decode_success_vfp;
+ /* FIXME alignment constraints? */
+ }
+
+ after_vfp_fldms_fstms:
+
+ /* --------------------- fmsr, fmrs --------------------- */
+ if (BITS8(1,1,1,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,1,1,1,0))
+ && BITS4(1,0,1,0) == INSN(11,8)
+ && BITS4(0,0,0,0) == INSN(3,0)
+ && BITS4(0,0,0,1) == (INSN(7,4) & BITS4(0,1,1,1))) {
+ UInt rD = INSN(15,12);
+ UInt b7 = (insn28 >> 7) & 1;
+ UInt fN = (INSN(19,16) << 1) | b7;
+ UInt b20 = (insn28 >> 20) & 1;
+ if (rD == 15) {
+ /* fall through */
+ /* Let's assume that no sane person would want to do
+ floating-point transfers to or from the program counter,
+ and simply decline to decode the instruction. The ARM ARM
+ doesn't seem to explicitly disallow this case, though. */
+ } else {
+ if (b20) {
+ IRExpr* res = unop(Iop_ReinterpF32asI32, getFReg(fN));
+ if (isT)
+ putIRegT(rD, res, condT);
+ else
+ putIRegA(rD, res, condT, Ijk_Boring);
+ DIP("fmrs%s r%u, s%u\n", nCC(conq), rD, fN);
+ } else {
+ putFReg(fN, unop(Iop_ReinterpI32asF32,
+ isT ? getIRegT(rD) : getIRegA(rD)),
+ condT);
+ DIP("fmsr%s s%u, r%u\n", nCC(conq), fN, rD);
+ }
+ goto decode_success_vfp;
+ }
+ /* fall through */
+ }
+
+ /* --------------------- f{ld,st}s --------------------- */
+ // FLDS, FSTS
+ if (BITS8(1,1,0,1,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,0,1,0))
+ && BITS4(1,0,1,0) == INSN(11,8)) {
+ UInt bD = (insn28 >> 22) & 1;
+ UInt fD = (INSN(15,12) << 1) | bD;
+ UInt rN = INSN(19,16);
+ UInt offset = (insn28 & 0xFF) << 2;
+ UInt bU = (insn28 >> 23) & 1; /* 1: +offset 0: -offset */
+ UInt bL = (insn28 >> 20) & 1; /* 1: load 0: store */
+ /* make unconditional */
+ if (condT != IRTemp_INVALID) {
+ if (isT)
+ mk_skip_over_T32_if_cond_is_false( condT );
+ else
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+ IRTemp ea = newTemp(Ity_I32);
+ assign(ea, binop(bU ? Iop_Add32 : Iop_Sub32,
+ align4if(isT ? getIRegT(rN) : getIRegA(rN),
+ rN == 15),
+ mkU32(offset)));
+ if (bL) {
+ putFReg(fD, loadLE(Ity_F32,mkexpr(ea)), IRTemp_INVALID);
+ } else {
+ storeLE(mkexpr(ea), getFReg(fD));
+ }
+ DIP("f%ss%s s%u, [r%u, %c#%u]\n",
+ bL ? "ld" : "st", nCC(conq), fD, rN,
+ bU ? '+' : '-', offset);
+ goto decode_success_vfp;
+ }
+
+ /* --------------------- dp insns (F) --------------------- */
+ if (BITS8(1,1,1,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,0,0,0))
+ && BITS4(1,0,1,0) == (INSN(11,8) & BITS4(1,1,1,0))
+ && BITS4(0,0,0,0) == (INSN(7,4) & BITS4(0,0,0,1))) {
+ UInt bM = (insn28 >> 5) & 1;
+ UInt bD = (insn28 >> 22) & 1;
+ UInt bN = (insn28 >> 7) & 1;
+ UInt fM = (INSN(3,0) << 1) | bM; /* argR */
+ UInt fD = (INSN(15,12) << 1) | bD; /* dst/acc */
+ UInt fN = (INSN(19,16) << 1) | bN; /* argL */
+ UInt bP = (insn28 >> 23) & 1;
+ UInt bQ = (insn28 >> 21) & 1;
+ UInt bR = (insn28 >> 20) & 1;
+ UInt bS = (insn28 >> 6) & 1;
+ UInt opc = (bP << 3) | (bQ << 2) | (bR << 1) | bS;
+ IRExpr* rm = get_FAKE_roundingmode(); /* XXXROUNDINGFIXME */
+ switch (opc) {
+ case BITS4(0,0,0,0): /* MAC: d + n * m */
+ putFReg(fD, triop(Iop_AddF32, rm,
+ getFReg(fD),
+ triop(Iop_MulF32, rm, getFReg(fN), getFReg(fM))),
+ condT);
+ DIP("fmacs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+ goto decode_success_vfp;
+ case BITS4(0,0,0,1): /* NMAC: d + -(n * m) */
+ putFReg(fD, triop(Iop_AddF32, rm,
+ getFReg(fD),
+ unop(Iop_NegF32,
+ triop(Iop_MulF32, rm, getFReg(fN),
+ getFReg(fM)))),
+ condT);
+ DIP("fnmacs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+ goto decode_success_vfp;
+ case BITS4(0,0,1,0): /* MSC: - d + n * m */
+ putFReg(fD, triop(Iop_AddF32, rm,
+ unop(Iop_NegF32, getFReg(fD)),
+ triop(Iop_MulF32, rm, getFReg(fN), getFReg(fM))),
+ condT);
+ DIP("fmscs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+ goto decode_success_vfp;
+ case BITS4(0,0,1,1): /* NMSC: - d + -(n * m) */
+ putFReg(fD, triop(Iop_AddF32, rm,
+ unop(Iop_NegF32, getFReg(fD)),
+ unop(Iop_NegF32,
+ triop(Iop_MulF32, rm,
+ getFReg(fN),
+ getFReg(fM)))),
+ condT);
+ DIP("fnmscs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+ goto decode_success_vfp;
+ case BITS4(0,1,0,0): /* MUL: n * m */
+ putFReg(fD, triop(Iop_MulF32, rm, getFReg(fN), getFReg(fM)),
+ condT);
+ DIP("fmuls%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+ goto decode_success_vfp;
+ case BITS4(0,1,0,1): /* NMUL: - n * m */
+ putFReg(fD, unop(Iop_NegF32,
+ triop(Iop_MulF32, rm, getFReg(fN),
+ getFReg(fM))),
+ condT);
+ DIP("fnmuls%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+ goto decode_success_vfp;
+ case BITS4(0,1,1,0): /* ADD: n + m */
+ putFReg(fD, triop(Iop_AddF32, rm, getFReg(fN), getFReg(fM)),
+ condT);
+ DIP("fadds%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+ goto decode_success_vfp;
+ case BITS4(0,1,1,1): /* SUB: n - m */
+ putFReg(fD, triop(Iop_SubF32, rm, getFReg(fN), getFReg(fM)),
+ condT);
+ DIP("fsubs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+ goto decode_success_vfp;
+ case BITS4(1,0,0,0): /* DIV: n / m */
+ putFReg(fD, triop(Iop_DivF32, rm, getFReg(fN), getFReg(fM)),
+ condT);
+ DIP("fdivs%s s%u, s%u, s%u\n", nCC(conq), fD, fN, fM);
+ goto decode_success_vfp;
+ default:
+ break;
+ }
+ }
+
+ /* --------------------- compares (S) --------------------- */
+ /* 31 27 23 19 15 11 7 3
+ 28 24 20 16 12 8 4 0
+ FCMPS cond 1110 1D11 0100 Fd 1010 01M0 Fm
+ FCMPES cond 1110 1D11 0100 Fd 1010 11M0 Fm
+ FCMPZS cond 1110 1D11 0101 Fd 1010 0100 0000
+      FCMPZES cond 1110 1D11 0101 Fd 1010 1100 0000
+ Z N
+
+ Z=0 Compare Fd:D vs Fm:M and set FPSCR 31:28 accordingly
+ Z=1 Compare Fd:D vs zero
+
+ N=1 generates Invalid Operation exn if either arg is any kind of NaN
+ N=0 generates Invalid Operation exn if either arg is a signalling NaN
+ (Not that we pay any attention to N here)
+ */
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,1,0,0) == (INSN(19,16) & BITS4(1,1,1,0))
+ && BITS4(1,0,1,0) == INSN(11,8)
+ && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
+ UInt bZ = (insn28 >> 16) & 1;
+ UInt bN = (insn28 >> 7) & 1;
+ UInt bD = (insn28 >> 22) & 1;
+ UInt bM = (insn28 >> 5) & 1;
+ UInt fD = (INSN(15,12) << 1) | bD;
+ UInt fM = (INSN(3,0) << 1) | bM;
+ if (bZ && (INSN(3,0) != 0 || (INSN(7,4) & 3) != 0)) {
+ /* does not decode; fall through */
+ } else {
+ IRTemp argL = newTemp(Ity_F64);
+ IRTemp argR = newTemp(Ity_F64);
+ IRTemp irRes = newTemp(Ity_I32);
+
+ assign(argL, unop(Iop_F32toF64, getFReg(fD)));
+ assign(argR, bZ ? IRExpr_Const(IRConst_F64i(0))
+ : unop(Iop_F32toF64, getFReg(fM)));
+ assign(irRes, binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)));
+
+ IRTemp nzcv = IRTemp_INVALID;
+ IRTemp oldFPSCR = newTemp(Ity_I32);
+ IRTemp newFPSCR = newTemp(Ity_I32);
+
+ /* This is where the fun starts. We have to convert 'irRes'
+ from an IR-convention return result (IRCmpF64Result) to an
+ ARM-encoded (N,Z,C,V) group. The final result is in the
+ bottom 4 bits of 'nzcv'. */
+ /* Map compare result from IR to ARM(nzcv) */
+ /*
+ FP cmp result | IR | ARM(nzcv)
+ --------------------------------
+ UN 0x45 0011
+ LT 0x01 1000
+ GT 0x00 0010
+ EQ 0x40 0110
+ */
+ nzcv = mk_convert_IRCmpF64Result_to_NZCV(irRes);
+
+ /* And update FPSCR accordingly */
+ assign(oldFPSCR, IRExpr_Get(OFFB_FPSCR, Ity_I32));
+ assign(newFPSCR,
+ binop(Iop_Or32,
+ binop(Iop_And32, mkexpr(oldFPSCR), mkU32(0x0FFFFFFF)),
+ binop(Iop_Shl32, mkexpr(nzcv), mkU8(28))));
+
+ putMiscReg32(OFFB_FPSCR, mkexpr(newFPSCR), condT);
+
+ if (bZ) {
+ DIP("fcmpz%ss%s s%u\n", bN ? "e" : "", nCC(conq), fD);
+ } else {
+ DIP("fcmp%ss%s s%u, s%u\n", bN ? "e" : "",
+ nCC(conq), fD, fM);
+ }
+ goto decode_success_vfp;
+ }
+ /* fall through */
+ }
+
+ /* --------------------- unary (S) --------------------- */
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,0,0,0) == (INSN(19,16) & BITS4(1,1,1,0))
+ && BITS4(1,0,1,0) == INSN(11,8)
+ && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
+ UInt bD = (insn28 >> 22) & 1;
+ UInt bM = (insn28 >> 5) & 1;
+ UInt fD = (INSN(15,12) << 1) | bD;
+ UInt fM = (INSN(3,0) << 1) | bM;
+ UInt b16 = (insn28 >> 16) & 1;
+ UInt b7 = (insn28 >> 7) & 1;
+ /**/ if (b16 == 0 && b7 == 0) {
+ // FCPYS
+ putFReg(fD, getFReg(fM), condT);
+ DIP("fcpys%s s%u, s%u\n", nCC(conq), fD, fM);
+ goto decode_success_vfp;
+ }
+ else if (b16 == 0 && b7 == 1) {
+ // FABSS
+ putFReg(fD, unop(Iop_AbsF32, getFReg(fM)), condT);
+ DIP("fabss%s s%u, s%u\n", nCC(conq), fD, fM);
+ goto decode_success_vfp;
+ }
+ else if (b16 == 1 && b7 == 0) {
+ // FNEGS
+ putFReg(fD, unop(Iop_NegF32, getFReg(fM)), condT);
+ DIP("fnegs%s s%u, s%u\n", nCC(conq), fD, fM);
+ goto decode_success_vfp;
+ }
+ else if (b16 == 1 && b7 == 1) {
+ // FSQRTS
+ IRExpr* rm = get_FAKE_roundingmode(); /* XXXROUNDINGFIXME */
+ putFReg(fD, binop(Iop_SqrtF32, rm, getFReg(fM)), condT);
+ DIP("fsqrts%s s%u, s%u\n", nCC(conq), fD, fM);
+ goto decode_success_vfp;
+ }
+ else
+ vassert(0);
+
+ /* fall through */
+ }
+
+ /* ----------------- I <-> S conversions ----------------- */
+
+ // F{S,U}ITOS fD, fM
+ /* These are more complex than FSITOD/FUITOD. In the D cases, a 32
+ bit int will always fit within the 53 bit mantissa, so there's
+ no possibility of a loss of precision, but that's obviously not
+ the case here. Hence this case possibly requires rounding, and
+ so it drags in the current rounding mode. */
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(1,0,0,0) == INSN(19,16)
+ && BITS4(1,0,1,0) == (INSN(11,8) & BITS4(1,1,1,0))
+ && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
+ UInt bM = (insn28 >> 5) & 1;
+ UInt bD = (insn28 >> 22) & 1;
+ UInt fM = (INSN(3,0) << 1) | bM;
+ UInt fD = (INSN(15,12) << 1) | bD;
+ UInt syned = (insn28 >> 7) & 1;
+ IRTemp rmode = newTemp(Ity_I32);
+ assign(rmode, mkexpr(mk_get_IR_rounding_mode()));
+ if (syned) {
+ // FSITOS
+ putFReg(fD, binop(Iop_F64toF32,
+ mkexpr(rmode),
+ unop(Iop_I32StoF64,
+ unop(Iop_ReinterpF32asI32, getFReg(fM)))),
+ condT);
+ DIP("fsitos%s s%u, s%u\n", nCC(conq), fD, fM);
+ } else {
+ // FUITOS
+ putFReg(fD, binop(Iop_F64toF32,
+ mkexpr(rmode),
+ unop(Iop_I32UtoF64,
+ unop(Iop_ReinterpF32asI32, getFReg(fM)))),
+ condT);
+ DIP("fuitos%s s%u, s%u\n", nCC(conq), fD, fM);
+ }
+ goto decode_success_vfp;
+ }
+
+ // FTO{S,U}IS fD, fM
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(1,1,0,0) == (INSN(19,16) & BITS4(1,1,1,0))
+ && BITS4(1,0,1,0) == INSN(11,8)
+ && BITS4(0,1,0,0) == (INSN(7,4) & BITS4(0,1,0,1))) {
+ UInt bM = (insn28 >> 5) & 1;
+ UInt bD = (insn28 >> 22) & 1;
+ UInt fD = (INSN(15,12) << 1) | bD;
+ UInt fM = (INSN(3,0) << 1) | bM;
+ UInt bZ = (insn28 >> 7) & 1;
+ UInt syned = (insn28 >> 16) & 1;
+ IRTemp rmode = newTemp(Ity_I32);
+ assign(rmode, bZ ? mkU32(Irrm_ZERO)
+ : mkexpr(mk_get_IR_rounding_mode()));
+ if (syned) {
+ // FTOSIS
+ putFReg(fD, unop(Iop_ReinterpI32asF32,
+ binop(Iop_F64toI32S, mkexpr(rmode),
+ unop(Iop_F32toF64, getFReg(fM)))),
+ condT);
+ DIP("ftosi%ss%s s%u, d%u\n", bZ ? "z" : "",
+ nCC(conq), fD, fM);
+ goto decode_success_vfp;
+ } else {
+ // FTOUIS
+ putFReg(fD, unop(Iop_ReinterpI32asF32,
+ binop(Iop_F64toI32U, mkexpr(rmode),
+ unop(Iop_F32toF64, getFReg(fM)))),
+ condT);
+ DIP("ftoui%ss%s s%u, d%u\n", bZ ? "z" : "",
+ nCC(conq), fD, fM);
+ goto decode_success_vfp;
+ }
+ }
+
+ /* ----------------- S <-> D conversions ----------------- */
+
+ // FCVTDS
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,1,1,1) == INSN(19,16)
+ && BITS4(1,0,1,0) == INSN(11,8)
+ && BITS4(1,1,0,0) == (INSN(7,4) & BITS4(1,1,0,1))) {
+ UInt dD = INSN(15,12) | (INSN(22,22) << 4);
+ UInt bM = (insn28 >> 5) & 1;
+ UInt fM = (INSN(3,0) << 1) | bM;
+ putDReg(dD, unop(Iop_F32toF64, getFReg(fM)), condT);
+ DIP("fcvtds%s d%u, s%u\n", nCC(conq), dD, fM);
+ goto decode_success_vfp;
+ }
+
+ // FCVTSD
+ if (BITS8(1,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,1,1,1) == INSN(19,16)
+ && BITS4(1,0,1,1) == INSN(11,8)
+ && BITS4(1,1,0,0) == (INSN(7,4) & BITS4(1,1,0,1))) {
+ UInt bD = (insn28 >> 22) & 1;
+ UInt fD = (INSN(15,12) << 1) | bD;
+ UInt dM = INSN(3,0) | (INSN(5,5) << 4);
+ IRTemp rmode = newTemp(Ity_I32);
+ assign(rmode, mkexpr(mk_get_IR_rounding_mode()));
+ putFReg(fD, binop(Iop_F64toF32, mkexpr(rmode), getDReg(dM)),
+ condT);
+ DIP("fcvtsd%s s%u, d%u\n", nCC(conq), fD, dM);
+ goto decode_success_vfp;
+ }
+
+ /* FAILURE */
+ return False;
+
+ decode_success_vfp:
+ /* Check that any accepted insn really is a CP10 or CP11 insn, iow,
+ assert that we aren't accepting, in this fn, insns that actually
+ should be handled somewhere else. */
+ vassert(INSN(11,9) == BITS3(1,0,1)); // 11:8 = 1010 or 1011
+ return True;
+
+# undef INSN
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Instructions in NV (never) space ---*/
+/*------------------------------------------------------------*/
+
+/* ARM only */
+/* Translate a NV space instruction.  If successful, returns True,
+   and *dres may or may not be updated.  On failure, returns False,
+   and neither changes *dres nor creates any IR.
+
+ Note that all NEON instructions (in ARM mode) are handled through
+ here, since they are all in NV space.
+*/
+static Bool decode_NV_instruction ( /*MOD*/DisResult* dres,
+ VexArchInfo* archinfo,
+ UInt insn )
+{
+# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
+# define INSN_COND SLICE_UInt(insn, 31, 28)
+
+ HChar dis_buf[128];
+
+ // Should only be called for NV instructions
+ vassert(BITS4(1,1,1,1) == INSN_COND);
+
+ /* ------------------------ pld ------------------------ */
+ if (BITS8(0,1,0,1, 0, 1,0,1) == (INSN(27,20) & BITS8(1,1,1,1,0,1,1,1))
+ && BITS4(1,1,1,1) == INSN(15,12)) {
+ UInt rN = INSN(19,16);
+ UInt imm12 = INSN(11,0);
+ UInt bU = INSN(23,23);
+ DIP("pld [r%u, #%c%u]\n", rN, bU ? '+' : '-', imm12);
+ return True;
+ }
+
+ if (BITS8(0,1,1,1, 0, 1,0,1) == (INSN(27,20) & BITS8(1,1,1,1,0,1,1,1))
+ && BITS4(1,1,1,1) == INSN(15,12)
+ && 0 == INSN(4,4)) {
+ UInt rN = INSN(19,16);
+ UInt rM = INSN(3,0);
+ UInt imm5 = INSN(11,7);
+ UInt sh2 = INSN(6,5);
+ UInt bU = INSN(23,23);
+ if (rM != 15) {
+ IRExpr* eaE = mk_EA_reg_plusminus_shifted_reg(rN, bU, rM,
+ sh2, imm5, dis_buf);
+ IRTemp eaT = newTemp(Ity_I32);
+ /* Bind eaE to a temp merely for debugging-vex purposes, so we
+ can check it's a plausible decoding. It will get removed
+ by iropt a little later on. */
+ vassert(eaE);
+ assign(eaT, eaE);
+ DIP("pld %s\n", dis_buf);
+ return True;
+ }
+ /* fall through */
+ }
+
+ /* ------------------------ pli ------------------------ */
+ if (BITS8(0,1,0,0, 0, 1,0,1) == (INSN(27,20) & BITS8(1,1,1,1,0,1,1,1))
+ && BITS4(1,1,1,1) == INSN(15,12)) {
+ UInt rN = INSN(19,16);
+ UInt imm12 = INSN(11,0);
+ UInt bU = INSN(23,23);
+ DIP("pli [r%u, #%c%u]\n", rN, bU ? '+' : '-', imm12);
+ return True;
+ }
+
+ /* --------------------- Interworking branches --------------------- */
+
+ // BLX (1), viz, unconditional branch and link to R15+simm24
+ // and set CPSR.T = 1, that is, switch to Thumb mode
+ if (INSN(31,25) == BITS7(1,1,1,1,1,0,1)) {
+ UInt bitH = INSN(24,24);
+ Int uimm24 = INSN(23,0);
+ Int simm24 = (((uimm24 << 8) >> 8) << 2) + (bitH << 1);
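+ /* The "<< 8 then >> 8" pair sign-extends the 24-bit field to 32
+ bits; this relies on >> of a negative signed int being an
+ arithmetic shift, which C leaves implementation-defined but
+ which holds for compilers in practical use. The "<< 2" scales
+ words to bytes, and bitH supplies the extra halfword offset
+ that BLX(1) encodes. E.g. uimm24 = 0xFFFFFF, bitH = 1 gives
+ simm24 = -4 + 2 = -2. */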
+ /* Now this is a bit tricky. Since we're decoding an ARM insn,
+ it is implied that CPSR.T == 0. Hence the current insn's
+ address is guaranteed to be of the form X--(30)--X00. So, no
+ need to mask any bits off it. But need to set the lowest bit
+ to 1 to denote we're in Thumb mode after this, since
+ guest_R15T has CPSR.T as the lowest bit. And we can't chase
+ into the call, so end the block at this point. */
+ UInt dst = guest_R15_curr_instr_notENC + 8 + (simm24 | 1);
+ putIRegA( 14, mkU32(guest_R15_curr_instr_notENC + 4),
+ IRTemp_INVALID/*because AL*/, Ijk_Boring );
+ irsb->next = mkU32(dst);
+ irsb->jumpkind = Ijk_Call;
+ dres->whatNext = Dis_StopHere;
+ DIP("blx 0x%x (and switch to Thumb mode)\n", dst - 1);
+ return True;
+ }
+
+ /* ------------------- v7 barrier insns ------------------- */
+ switch (insn) {
+ case 0xF57FF06F: /* ISB */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("ISB\n");
+ return True;
+ case 0xF57FF04F: /* DSB */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("DSB\n");
+ return True;
+ case 0xF57FF05F: /* DMB */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("DMB\n");
+ return True;
+ default:
+ break;
+ }
+
+ /* ------------------- NEON ------------------- */
+ if (archinfo->hwcaps & VEX_HWCAPS_ARM_NEON) {
+ Bool ok_neon = decode_NEON_instruction(
+ dres, insn, IRTemp_INVALID/*unconditional*/,
+ False/*!isT*/
+ );
+ if (ok_neon)
+ return True;
+ }
+
+ // unrecognised
+ return False;
+
+# undef INSN_COND
+# undef INSN
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Disassemble a single ARM instruction ---*/
+/*------------------------------------------------------------*/
+
+/* Disassemble a single ARM instruction into IR. The instruction is
+ located in host memory at guest_instr, and has (decoded) guest IP
+ of guest_R15_curr_instr_notENC, which will have been set before the
+ call here. */
+
+static
+DisResult disInstr_ARM_WRK (
+ Bool put_IP,
+ Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_instr,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo
+ )
+{
+ // A macro to fish bits out of 'insn'.
+# define INSN(_bMax,_bMin) SLICE_UInt(insn, (_bMax), (_bMin))
+# define INSN_COND SLICE_UInt(insn, 31, 28)
+
+ DisResult dres;
+ UInt insn;
+ //Bool allow_VFP = False;
+ //UInt hwcaps = archinfo->hwcaps;
+ IRTemp condT; /* :: Ity_I32 */
+ UInt summary;
+ HChar dis_buf[128]; // big enough to hold LDMIA etc text
+
+ /* What insn variants are we supporting today? */
+ //allow_VFP = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
+ // etc etc
+
+ /* Set result defaults. */
+ dres.whatNext = Dis_Continue;
+ dres.len = 4;
+ dres.continueAt = 0;
+
+ /* Set default actions for post-insn handling of writes to r15, if
+ required. */
+ r15written = False;
+ r15guard = IRTemp_INVALID; /* unconditional */
+ r15kind = Ijk_Boring;
+
+ /* At least this is simple on ARM: insns are all 4 bytes long, and
+ 4-aligned. So just fish the whole thing out of memory right now
+ and have done. */
+ insn = getUIntLittleEndianly( guest_instr );
+
+ if (0) vex_printf("insn: 0x%x\n", insn);
+
+ DIP("\t(arm) 0x%x: ", (UInt)guest_R15_curr_instr_notENC);
+
+ /* We may be asked to update the guest R15 before going further. */
+ vassert(0 == (guest_R15_curr_instr_notENC & 3));
+ if (put_IP) {
+ llPutIReg( 15, mkU32(guest_R15_curr_instr_notENC) );
+ }
+
+ /* ----------------------------------------------------------- */
+
+ /* Spot "Special" instructions (see comment at top of file). */
+ {
+ UChar* code = (UChar*)guest_instr;
+ /* Spot the 16-byte preamble:
+
+ e1a0c1ec mov r12, r12, ROR #3
+ e1a0c6ec mov r12, r12, ROR #13
+ e1a0ceec mov r12, r12, ROR #29
+ e1a0c9ec mov r12, r12, ROR #19
+ */
+ UInt word1 = 0xE1A0C1EC;
+ UInt word2 = 0xE1A0C6EC;
+ UInt word3 = 0xE1A0CEEC;
+ UInt word4 = 0xE1A0C9EC;
+ if (getUIntLittleEndianly(code+ 0) == word1 &&
+ getUIntLittleEndianly(code+ 4) == word2 &&
+ getUIntLittleEndianly(code+ 8) == word3 &&
+ getUIntLittleEndianly(code+12) == word4) {
+ /* Got a "Special" instruction preamble. Which one is it? */
+ if (getUIntLittleEndianly(code+16) == 0xE18AA00A
+ /* orr r10,r10,r10 */) {
+ /* R3 = client_request ( R4 ) */
+ DIP("r3 = client_request ( %%r4 )\n");
+ irsb->next = mkU32( guest_R15_curr_instr_notENC + 20 );
+ irsb->jumpkind = Ijk_ClientReq;
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ else
+ if (getUIntLittleEndianly(code+16) == 0xE18BB00B
+ /* orr r11,r11,r11 */) {
+ /* R3 = guest_NRADDR */
+ DIP("r3 = guest_NRADDR\n");
+ dres.len = 20;
+ llPutIReg(3, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
+ goto decode_success;
+ }
+ else
+ if (getUIntLittleEndianly(code+16) == 0xE18CC00C
+ /* orr r12,r12,r12 */) {
+ /* branch-and-link-to-noredir R4 */
+ DIP("branch-and-link-to-noredir r4\n");
+ llPutIReg(14, mkU32( guest_R15_curr_instr_notENC + 20) );
+ irsb->next = llGetIReg(4);
+ irsb->jumpkind = Ijk_NoRedir;
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ /* We don't know what it is. Set insn so decode_failure
+ can print the insn following the Special-insn preamble. */
+ insn = getUIntLittleEndianly(code+16);
+ goto decode_failure;
+ /*NOTREACHED*/
+ }
+
+ }
+
+ /* ----------------------------------------------------------- */
+
+ /* Main ARM instruction decoder starts here. */
+
+ /* Deal with the condition. Strategy is to merely generate a
+ condition temporary at this point (or IRTemp_INVALID, meaning
+ unconditional). We leave it to lower-level instruction decoders
+ to decide whether they can generate straight-line code, or
+ whether they must generate a side exit before the instruction.
+ condT :: Ity_I32 and is always either zero or one. */
+ condT = IRTemp_INVALID;
+ switch ( (ARMCondcode)INSN_COND ) {
+ case ARMCondNV: {
+ // Illegal instruction prior to v5 (see ARM ARM A3-5), but
+ // some cases are acceptable
+ Bool ok = decode_NV_instruction(&dres, archinfo, insn);
+ if (ok)
+ goto decode_success;
+ else
+ goto decode_failure;
+ }
+ case ARMCondAL: // Always executed
+ break;
+ case ARMCondEQ: case ARMCondNE: case ARMCondHS: case ARMCondLO:
+ case ARMCondMI: case ARMCondPL: case ARMCondVS: case ARMCondVC:
+ case ARMCondHI: case ARMCondLS: case ARMCondGE: case ARMCondLT:
+ case ARMCondGT: case ARMCondLE:
+ condT = newTemp(Ity_I32);
+ assign( condT, mk_armg_calculate_condition( INSN_COND ));
+ break;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- ARMv5 integer instructions -- */
+ /* ----------------------------------------------------------- */
+
+ /* ---------------- Data processing ops ------------------- */
+
+ if (0 == (INSN(27,20) & BITS8(1,1,0,0,0,0,0,0))
+ && !(INSN(25,25) == 0 && INSN(7,7) == 1 && INSN(4,4) == 1)) {
+ IRTemp shop = IRTemp_INVALID; /* shifter operand */
+ IRTemp shco = IRTemp_INVALID; /* shifter carry out */
+ UInt rD = (insn >> 12) & 0xF; /* 15:12 */
+ UInt rN = (insn >> 16) & 0xF; /* 19:16 */
+ UInt bitS = (insn >> 20) & 1; /* 20:20 */
+ IRTemp rNt = IRTemp_INVALID;
+ IRTemp res = IRTemp_INVALID;
+ IRTemp oldV = IRTemp_INVALID;
+ IRTemp oldC = IRTemp_INVALID;
+ HChar* name = NULL;
+ IROp op = Iop_INVALID;
+ Bool ok;
+
+ switch (INSN(24,21)) {
+
+ /* --------- ADD, SUB, AND, OR --------- */
+ case BITS4(0,1,0,0): /* ADD: Rd = Rn + shifter_operand */
+ name = "add"; op = Iop_Add32; goto rd_eq_rn_op_SO;
+ case BITS4(0,0,1,0): /* SUB: Rd = Rn - shifter_operand */
+ name = "sub"; op = Iop_Sub32; goto rd_eq_rn_op_SO;
+ case BITS4(0,0,1,1): /* RSB: Rd = shifter_operand - Rn */
+ name = "rsb"; op = Iop_Sub32; goto rd_eq_rn_op_SO;
+ case BITS4(0,0,0,0): /* AND: Rd = Rn & shifter_operand */
+ name = "and"; op = Iop_And32; goto rd_eq_rn_op_SO;
+ case BITS4(1,1,0,0): /* OR: Rd = Rn | shifter_operand */
+ name = "orr"; op = Iop_Or32; goto rd_eq_rn_op_SO;
+ case BITS4(0,0,0,1): /* EOR: Rd = Rn ^ shifter_operand */
+ name = "eor"; op = Iop_Xor32; goto rd_eq_rn_op_SO;
+ case BITS4(1,1,1,0): /* BIC: Rd = Rn & ~shifter_operand */
+ name = "bic"; op = Iop_And32; goto rd_eq_rn_op_SO;
+ rd_eq_rn_op_SO: {
+ Bool isRSB = False;
+ Bool isBIC = False;
+ switch (INSN(24,21)) {
+ case BITS4(0,0,1,1):
+ vassert(op == Iop_Sub32); isRSB = True; break;
+ case BITS4(1,1,1,0):
+ vassert(op == Iop_And32); isBIC = True; break;
+ default:
+ break;
+ }
+ rNt = newTemp(Ity_I32);
+ assign(rNt, getIRegA(rN));
+ ok = mk_shifter_operand(
+ INSN(25,25), INSN(11,0),
+ &shop, bitS ? &shco : NULL, dis_buf
+ );
+ if (!ok)
+ break;
+ res = newTemp(Ity_I32);
+ // compute the main result
+ if (isRSB) {
+ // reverse-subtract: shifter_operand - Rn
+ vassert(op == Iop_Sub32);
+ assign(res, binop(op, mkexpr(shop), mkexpr(rNt)) );
+ } else if (isBIC) {
+ // bic: Rn & ~shifter_operand
+ vassert(op == Iop_And32);
+ assign(res, binop(op, mkexpr(rNt),
+ unop(Iop_Not32, mkexpr(shop))) );
+ } else {
+ // normal: Rn op shifter_operand
+ assign(res, binop(op, mkexpr(rNt), mkexpr(shop)) );
+ }
+ // but don't commit it until after we've finished
+ // all necessary reads from the guest state
+ if (bitS
+ && (op == Iop_And32 || op == Iop_Or32 || op == Iop_Xor32)) {
+ oldV = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ }
+ // can't safely read guest state after here
+ // now safe to put the main result
+ putIRegA( rD, mkexpr(res), condT, Ijk_Boring );
+ // XXXX!! not safe to read any guest state after
+ // this point (I think the code below doesn't do that).
+ if (!bitS)
+ vassert(shco == IRTemp_INVALID);
+ /* Update the flags thunk if necessary */
+ if (bitS) {
+ vassert(shco != IRTemp_INVALID);
+ switch (op) {
+ case Iop_Add32:
+ setFlags_D1_D2( ARMG_CC_OP_ADD, rNt, shop, condT );
+ break;
+ case Iop_Sub32:
+ if (isRSB) {
+ setFlags_D1_D2( ARMG_CC_OP_SUB, shop, rNt, condT );
+ } else {
+ setFlags_D1_D2( ARMG_CC_OP_SUB, rNt, shop, condT );
+ }
+ break;
+ case Iop_And32: /* BIC and AND set the flags the same */
+ case Iop_Or32:
+ case Iop_Xor32:
+ // oldV has been read just above
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC,
+ res, shco, oldV, condT );
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ DIP("%s%s%s r%u, r%u, %s\n",
+ name, nCC(INSN_COND), bitS ? "s" : "", rD, rN, dis_buf );
+ goto decode_success;
+ }
+
+ /* --------- MOV, MVN --------- */
+ case BITS4(1,1,0,1): /* MOV: Rd = shifter_operand */
+ case BITS4(1,1,1,1): { /* MVN: Rd = not(shifter_operand) */
+ Bool isMVN = INSN(24,21) == BITS4(1,1,1,1);
+ if (rN != 0)
+ break; /* rN must be zero */
+ ok = mk_shifter_operand(
+ INSN(25,25), INSN(11,0),
+ &shop, bitS ? &shco : NULL, dis_buf
+ );
+ if (!ok)
+ break;
+ res = newTemp(Ity_I32);
+ assign( res, isMVN ? unop(Iop_Not32, mkexpr(shop))
+ : mkexpr(shop) );
+ if (bitS) {
+ vassert(shco != IRTemp_INVALID);
+ oldV = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ } else {
+ vassert(shco == IRTemp_INVALID);
+ }
+ // can't safely read guest state after here
+ putIRegA( rD, mkexpr(res), condT, Ijk_Boring );
+ /* Update the flags thunk if necessary */
+ if (bitS) {
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC,
+ res, shco, oldV, condT );
+ }
+ DIP("%s%s%s r%u, %s\n",
+ isMVN ? "mvn" : "mov",
+ nCC(INSN_COND), bitS ? "s" : "", rD, dis_buf );
+ goto decode_success;
+ }
+
+ /* --------- CMP --------- */
+ case BITS4(1,0,1,0): /* CMP: (void) Rn - shifter_operand */
+ case BITS4(1,0,1,1): { /* CMN: (void) Rn + shifter_operand */
+ Bool isCMN = INSN(24,21) == BITS4(1,0,1,1);
+ if (rD != 0)
+ break; /* rD must be zero */
+ if (bitS == 0)
+ break; /* if S (bit 20) is not set, it's not CMP/CMN */
+ rNt = newTemp(Ity_I32);
+ assign(rNt, getIRegA(rN));
+ ok = mk_shifter_operand(
+ INSN(25,25), INSN(11,0),
+ &shop, NULL, dis_buf
+ );
+ if (!ok)
+ break;
+ // can't safely read guest state after here
+ /* Update the flags thunk. */
+ setFlags_D1_D2( isCMN ? ARMG_CC_OP_ADD : ARMG_CC_OP_SUB,
+ rNt, shop, condT );
+ DIP("%s%s r%u, %s\n",
+ isCMN ? "cmn" : "cmp",
+ nCC(INSN_COND), rN, dis_buf );
+ goto decode_success;
+ }
+
+ /* --------- TST --------- */
+ case BITS4(1,0,0,0): /* TST: (void) Rn & shifter_operand */
+ case BITS4(1,0,0,1): { /* TEQ: (void) Rn ^ shifter_operand */
+ Bool isTEQ = INSN(24,21) == BITS4(1,0,0,1);
+ if (rD != 0)
+ break; /* rD must be zero */
+ if (bitS == 0)
+ break; /* if S (bit 20) is not set, it's not TST/TEQ */
+ rNt = newTemp(Ity_I32);
+ assign(rNt, getIRegA(rN));
+ ok = mk_shifter_operand(
+ INSN(25,25), INSN(11,0),
+ &shop, &shco, dis_buf
+ );
+ if (!ok)
+ break;
+ /* Update the flags thunk. */
+ res = newTemp(Ity_I32);
+ assign( res, binop(isTEQ ? Iop_Xor32 : Iop_And32,
+ mkexpr(rNt), mkexpr(shop)) );
+ oldV = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ // can't safely read guest state after here
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC,
+ res, shco, oldV, condT );
+ DIP("%s%s r%u, %s\n",
+ isTEQ ? "teq" : "tst",
+ nCC(INSN_COND), rN, dis_buf );
+ goto decode_success;
+ }
+
+ /* --------- ADC, SBC, RSC --------- */
+ case BITS4(0,1,0,1): /* ADC: Rd = Rn + shifter_operand + oldC */
+ name = "adc"; goto rd_eq_rn_op_SO_op_oldC;
+ case BITS4(0,1,1,0): /* SBC: Rd = Rn - shifter_operand - (oldC ^ 1) */
+ name = "sbc"; goto rd_eq_rn_op_SO_op_oldC;
+ case BITS4(0,1,1,1): /* RSC: Rd = shifter_operand - Rn - (oldC ^ 1) */
+ name = "rsc"; goto rd_eq_rn_op_SO_op_oldC;
+ rd_eq_rn_op_SO_op_oldC: {
+ // FIXME: shco isn't used for anything. Get rid of it.
+ rNt = newTemp(Ity_I32);
+ assign(rNt, getIRegA(rN));
+ ok = mk_shifter_operand(
+ INSN(25,25), INSN(11,0),
+ &shop, bitS ? &shco : NULL, dis_buf
+ );
+ if (!ok)
+ break;
+ oldC = newTemp(Ity_I32);
+ assign( oldC, mk_armg_calculate_flag_c() );
+ res = newTemp(Ity_I32);
+ // compute the main result
+ switch (INSN(24,21)) {
+ case BITS4(0,1,0,1): /* ADC */
+ assign(res,
+ binop(Iop_Add32,
+ binop(Iop_Add32, mkexpr(rNt), mkexpr(shop)),
+ mkexpr(oldC) ));
+ break;
+ case BITS4(0,1,1,0): /* SBC */
+ assign(res,
+ binop(Iop_Sub32,
+ binop(Iop_Sub32, mkexpr(rNt), mkexpr(shop)),
+ binop(Iop_Xor32, mkexpr(oldC), mkU32(1)) ));
+ break;
+ case BITS4(0,1,1,1): /* RSC */
+ assign(res,
+ binop(Iop_Sub32,
+ binop(Iop_Sub32, mkexpr(shop), mkexpr(rNt)),
+ binop(Iop_Xor32, mkexpr(oldC), mkU32(1)) ));
+ break;
+ default:
+ vassert(0);
+ }
+ // but don't commit it until after we've finished
+ // all necessary reads from the guest state
+ // now safe to put the main result
+ putIRegA( rD, mkexpr(res), condT, Ijk_Boring );
+ // XXXX!! not safe to read any guest state after
+ // this point (I think the code below doesn't do that).
+ if (!bitS)
+ vassert(shco == IRTemp_INVALID);
+ /* Update the flags thunk if necessary */
+ if (bitS) {
+ vassert(shco != IRTemp_INVALID);
+ switch (INSN(24,21)) {
+ case BITS4(0,1,0,1): /* ADC */
+ setFlags_D1_D2_ND( ARMG_CC_OP_ADC,
+ rNt, shop, oldC, condT );
+ break;
+ case BITS4(0,1,1,0): /* SBC */
+ setFlags_D1_D2_ND( ARMG_CC_OP_SBB,
+ rNt, shop, oldC, condT );
+ break;
+ case BITS4(0,1,1,1): /* RSC */
+ setFlags_D1_D2_ND( ARMG_CC_OP_SBB,
+ shop, rNt, oldC, condT );
+ break;
+ default:
+ vassert(0);
+ }
+ }
+ DIP("%s%s%s r%u, r%u, %s\n",
+ name, nCC(INSN_COND), bitS ? "s" : "", rD, rN, dis_buf );
+ goto decode_success;
+ }
+
+ /* --------- ??? --------- */
+ default:
+ break;
+ }
+ } /* if (0 == (INSN(27,20) & BITS8(1,1,0,0,0,0,0,0)) */
+
+ /* --------------------- Load/store (ubyte & word) -------- */
+ // LDR STR LDRB STRB
+ /* 31 27 23 19 15 11 6 4 3 # highest bit
+ 28 24 20 16 12
+ A5-20 1 | 16 cond 0101 UB0L Rn Rd imm12
+ A5-22 1 | 32 cond 0111 UB0L Rn Rd imm5 sh2 0 Rm
+ A5-24 2 | 16 cond 0101 UB1L Rn Rd imm12
+ A5-26 2 | 32 cond 0111 UB1L Rn Rd imm5 sh2 0 Rm
+ A5-28 3 | 16 cond 0100 UB0L Rn Rd imm12
+ A5-32 3 | 32 cond 0110 UB0L Rn Rd imm5 sh2 0 Rm
+ */
+ /* case coding:
+ 1 at-ea (access at ea)
+ 2 at-ea-then-upd (access at ea, then Rn = ea)
+ 3 at-Rn-then-upd (access at Rn, then Rn = ea)
+ ea coding
+ 16 Rn +/- imm12
+ 32 Rn +/- Rm sh2 imm5
+ */
+ /* Quickly skip over all of this for hopefully most instructions */
+ if ((INSN(27,24) & BITS4(1,1,0,0)) != BITS4(0,1,0,0))
+ goto after_load_store_ubyte_or_word;
+
+ summary = 0;
+
+ /**/ if (INSN(27,24) == BITS4(0,1,0,1) && INSN(21,21) == 0) {
+ summary = 1 | 16;
+ }
+ else if (INSN(27,24) == BITS4(0,1,1,1) && INSN(21,21) == 0
+ && INSN(4,4) == 0) {
+ summary = 1 | 32;
+ }
+ else if (INSN(27,24) == BITS4(0,1,0,1) && INSN(21,21) == 1) {
+ summary = 2 | 16;
+ }
+ else if (INSN(27,24) == BITS4(0,1,1,1) && INSN(21,21) == 1
+ && INSN(4,4) == 0) {
+ summary = 2 | 32;
+ }
+ else if (INSN(27,24) == BITS4(0,1,0,0) && INSN(21,21) == 0) {
+ summary = 3 | 16;
+ }
+ else if (INSN(27,24) == BITS4(0,1,1,0) && INSN(21,21) == 0
+ && INSN(4,4) == 0) {
+ summary = 3 | 32;
+ }
+ else goto after_load_store_ubyte_or_word;
+
+ { UInt rN = (insn >> 16) & 0xF; /* 19:16 */
+ UInt rD = (insn >> 12) & 0xF; /* 15:12 */
+ UInt rM = (insn >> 0) & 0xF; /* 3:0 */
+ UInt bU = (insn >> 23) & 1; /* 23 */
+ UInt bB = (insn >> 22) & 1; /* 22 */
+ UInt bL = (insn >> 20) & 1; /* 20 */
+ UInt imm12 = (insn >> 0) & 0xFFF; /* 11:0 */
+ UInt imm5 = (insn >> 7) & 0x1F; /* 11:7 */
+ UInt sh2 = (insn >> 5) & 3; /* 6:5 */
+
+ /* Skip some invalid cases, which would lead to two competing
+ updates to the same register, or which are otherwise
+ disallowed by the spec. */
+ switch (summary) {
+ case 1 | 16:
+ break;
+ case 1 | 32:
+ if (rM == 15) goto after_load_store_ubyte_or_word;
+ break;
+ case 2 | 16: case 3 | 16:
+ if (rN == 15) goto after_load_store_ubyte_or_word;
+ if (bL == 1 && rN == rD) goto after_load_store_ubyte_or_word;
+ break;
+ case 2 | 32: case 3 | 32:
+ if (rM == 15) goto after_load_store_ubyte_or_word;
+ if (rN == 15) goto after_load_store_ubyte_or_word;
+ if (rN == rM) goto after_load_store_ubyte_or_word;
+ if (bL == 1 && rN == rD) goto after_load_store_ubyte_or_word;
+ break;
+ default:
+ vassert(0);
+ }
+
+ /* Now, we can't do a conditional load or store, since that very
+ likely will generate an exception. So we have to take a side
+ exit at this point if the condition is false. */
+ if (condT != IRTemp_INVALID) {
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
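+ /* (mk_skip_over_A32_if_cond_is_false emits a side exit to the
+ next instruction, taken when the condition is false; all IR
+ emitted after it therefore executes unconditionally.) */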
+ /* Ok, now we're unconditional. Do the load or store. */
+
+ /* compute the effective address. Bind it to a tmp since we
+ may need to use it twice. */
+ IRExpr* eaE = NULL;
+ switch (summary & 0xF0) {
+ case 16:
+ eaE = mk_EA_reg_plusminus_imm12( rN, bU, imm12, dis_buf );
+ break;
+ case 32:
+ eaE = mk_EA_reg_plusminus_shifted_reg( rN, bU, rM, sh2, imm5,
+ dis_buf );
+ break;
+ }
+ vassert(eaE);
+ IRTemp eaT = newTemp(Ity_I32);
+ assign(eaT, eaE);
+
+ /* get the old Rn value */
+ IRTemp rnT = newTemp(Ity_I32);
+ assign(rnT, getIRegA(rN));
+
+ /* decide on the transfer address */
+ IRTemp taT = IRTemp_INVALID;
+ switch (summary & 0x0F) {
+ case 1: case 2: taT = eaT; break;
+ case 3: taT = rnT; break;
+ }
+ vassert(taT != IRTemp_INVALID);
+
+ if (bL == 0) {
+ /* Store. If necessary, update the base register before the
+ store itself, so that the common idiom of "str rX, [sp,
+ #-4]!" (store rX at sp-4, then do new sp = sp-4, a.k.a "push
+ rX") doesn't cause Memcheck to complain that the access is
+ below the stack pointer. Also, not updating sp before the
+ store confuses Valgrind's dynamic stack-extending logic. So
+ do it before the store. Hence we need to snarf the store
+ data before doing the basereg update. */
+
+ /* get hold of the data to be stored */
+ IRTemp rDt = newTemp(Ity_I32);
+ assign(rDt, getIRegA(rD));
+
+ /* Update Rn if necessary. */
+ switch (summary & 0x0F) {
+ case 2: case 3:
+ putIRegA( rN, mkexpr(eaT), IRTemp_INVALID, Ijk_Boring );
+ break;
+ }
+
+ /* generate the transfer */
+ if (bB == 0) { // word store
+ storeLE( mkexpr(taT), mkexpr(rDt) );
+ } else { // byte store
+ vassert(bB == 1);
+ storeLE( mkexpr(taT), unop(Iop_32to8, mkexpr(rDt)) );
+ }
+
+ } else {
+ /* Load */
+ vassert(bL == 1);
+
+ /* generate the transfer */
+ if (bB == 0) { // word load
+ putIRegA( rD, loadLE(Ity_I32, mkexpr(taT)),
+ IRTemp_INVALID, Ijk_Boring );
+ } else { // byte load
+ vassert(bB == 1);
+ putIRegA( rD, unop(Iop_8Uto32, loadLE(Ity_I8, mkexpr(taT))),
+ IRTemp_INVALID, Ijk_Boring );
+ }
+
+ /* Update Rn if necessary. */
+ switch (summary & 0x0F) {
+ case 2: case 3:
+ // should be assured by logic above:
+ if (bL == 1)
+ vassert(rD != rN); /* since we just wrote rD */
+ putIRegA( rN, mkexpr(eaT), IRTemp_INVALID, Ijk_Boring );
+ break;
+ }
+ }
+
+ switch (summary & 0x0F) {
+ case 1: DIP("%sr%s%s r%u, %s\n",
+ bL == 0 ? "st" : "ld",
+ bB == 0 ? "" : "b", nCC(INSN_COND), rD, dis_buf);
+ break;
+ case 2: DIP("%sr%s%s r%u, %s! (at-EA-then-Rn=EA)\n",
+ bL == 0 ? "st" : "ld",
+ bB == 0 ? "" : "b", nCC(INSN_COND), rD, dis_buf);
+ break;
+ case 3: DIP("%sr%s%s r%u, %s! (at-Rn-then-Rn=EA)\n",
+ bL == 0 ? "st" : "ld",
+ bB == 0 ? "" : "b", nCC(INSN_COND), rD, dis_buf);
+ break;
+ default: vassert(0);
+ }
+
+ /* XXX deal with alignment constraints */
+
+ goto decode_success;
+
+ /* Complications:
+
+ For all loads: if the Amode specifies base register
+ writeback, and the same register is specified for Rd and Rn,
+ the results are UNPREDICTABLE.
+
+ For all loads and stores: if R15 is written, branch to
+ that address afterwards.
+
+ STRB: straightforward
+ LDRB: loaded data is zero extended
+ STR: lowest 2 bits of address are ignored
+ LDR: if the lowest 2 bits of the address are nonzero
+ then the loaded value is rotated right by 8 * the lowest 2 bits
+ */
+ }
+
+ after_load_store_ubyte_or_word:
+
+ /* --------------------- Load/store (sbyte & hword) -------- */
+ // LDRH LDRSH STRH LDRSB
+ /* 31 27 23 19 15 11 7 3 # highest bit
+ 28 24 20 16 12 8 4 0
+ A5-36 1 | 16 cond 0001 U10L Rn Rd im4h 1SH1 im4l
+ A5-38 1 | 32 cond 0001 U00L Rn Rd 0000 1SH1 Rm
+ A5-40 2 | 16 cond 0001 U11L Rn Rd im4h 1SH1 im4l
+ A5-42 2 | 32 cond 0001 U01L Rn Rd 0000 1SH1 Rm
+ A5-44 3 | 16 cond 0000 U10L Rn Rd im4h 1SH1 im4l
+ A5-46 3 | 32 cond 0000 U00L Rn Rd 0000 1SH1 Rm
+ */
+ /* case coding:
+ 1 at-ea (access at ea)
+ 2 at-ea-then-upd (access at ea, then Rn = ea)
+ 3 at-Rn-then-upd (access at Rn, then Rn = ea)
+ ea coding
+ 16 Rn +/- imm8
+ 32 Rn +/- Rm
+ */
+ /* Quickly skip over all of this for hopefully most instructions */
+ if ((INSN(27,24) & BITS4(1,1,1,0)) != BITS4(0,0,0,0))
+ goto after_load_store_sbyte_or_hword;
+
+ /* Check the "1SH1" thing. */
+ if ((INSN(7,4) & BITS4(1,0,0,1)) != BITS4(1,0,0,1))
+ goto after_load_store_sbyte_or_hword;
+
+ summary = 0;
+
+ /**/ if (INSN(27,24) == BITS4(0,0,0,1) && INSN(22,21) == BITS2(1,0)) {
+ summary = 1 | 16;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,1) && INSN(22,21) == BITS2(0,0)) {
+ summary = 1 | 32;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,1) && INSN(22,21) == BITS2(1,1)) {
+ summary = 2 | 16;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,1) && INSN(22,21) == BITS2(0,1)) {
+ summary = 2 | 32;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,0) && INSN(22,21) == BITS2(1,0)) {
+ summary = 3 | 16;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,0) && INSN(22,21) == BITS2(0,0)) {
+ summary = 3 | 32;
+ }
+ else goto after_load_store_sbyte_or_hword;
+
+ { UInt rN = (insn >> 16) & 0xF; /* 19:16 */
+ UInt rD = (insn >> 12) & 0xF; /* 15:12 */
+ UInt rM = (insn >> 0) & 0xF; /* 3:0 */
+ UInt bU = (insn >> 23) & 1; /* 23 U=1 offset+, U=0 offset- */
+ UInt bL = (insn >> 20) & 1; /* 20 L=1 load, L=0 store */
+ UInt bH = (insn >> 5) & 1; /* H=1 halfword, H=0 byte */
+ UInt bS = (insn >> 6) & 1; /* S=1 signed, S=0 unsigned */
+ UInt imm8 = ((insn >> 4) & 0xF0) | (insn & 0xF); /* 11:8, 3:0 */
+
+ /* Skip combinations that are either meaningless or already
+ handled by main word-or-unsigned-byte load-store
+ instructions. */
+ if (bS == 0 && bH == 0) /* "unsigned byte" */
+ goto after_load_store_sbyte_or_hword;
+ if (bS == 1 && bL == 0) /* "signed store" */
+ goto after_load_store_sbyte_or_hword;
+
+ /* Require 11:8 == 0 for Rn +/- Rm cases */
+ if ((summary & 32) != 0 && (imm8 & 0xF0) != 0)
+ goto after_load_store_sbyte_or_hword;
+
+ /* Skip some invalid cases, which would lead to two competing
+ updates to the same register, or which are otherwise
+ disallowed by the spec. */
+ switch (summary) {
+ case 1 | 16:
+ break;
+ case 1 | 32:
+ if (rM == 15) goto after_load_store_sbyte_or_hword;
+ break;
+ case 2 | 16: case 3 | 16:
+ if (rN == 15) goto after_load_store_sbyte_or_hword;
+ if (bL == 1 && rN == rD) goto after_load_store_sbyte_or_hword;
+ break;
+ case 2 | 32: case 3 | 32:
+ if (rM == 15) goto after_load_store_sbyte_or_hword;
+ if (rN == 15) goto after_load_store_sbyte_or_hword;
+ if (rN == rM) goto after_load_store_sbyte_or_hword;
+ if (bL == 1 && rN == rD) goto after_load_store_sbyte_or_hword;
+ break;
+ default:
+ vassert(0);
+ }
+
+ /* Now, we can't do a conditional load or store, since that very
+ likely will generate an exception. So we have to take a side
+ exit at this point if the condition is false. */
+ if (condT != IRTemp_INVALID) {
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+ /* Ok, now we're unconditional. Do the load or store. */
+
+ /* compute the effective address. Bind it to a tmp since we
+ may need to use it twice. */
+ IRExpr* eaE = NULL;
+ switch (summary & 0xF0) {
+ case 16:
+ eaE = mk_EA_reg_plusminus_imm8( rN, bU, imm8, dis_buf );
+ break;
+ case 32:
+ eaE = mk_EA_reg_plusminus_reg( rN, bU, rM, dis_buf );
+ break;
+ }
+ vassert(eaE);
+ IRTemp eaT = newTemp(Ity_I32);
+ assign(eaT, eaE);
+
+ /* get the old Rn value */
+ IRTemp rnT = newTemp(Ity_I32);
+ assign(rnT, getIRegA(rN));
+
+ /* decide on the transfer address */
+ IRTemp taT = IRTemp_INVALID;
+ switch (summary & 0x0F) {
+ case 1: case 2: taT = eaT; break;
+ case 3: taT = rnT; break;
+ }
+ vassert(taT != IRTemp_INVALID);
+
+ /* halfword store H 1 L 0 S 0
+ uhalf load H 1 L 1 S 0
+ shalf load H 1 L 1 S 1
+ sbyte load H 0 L 1 S 1
+ */
+ HChar* name = NULL;
+ /* generate the transfer */
+ /**/ if (bH == 1 && bL == 0 && bS == 0) { // halfword store
+ storeLE( mkexpr(taT), unop(Iop_32to16, getIRegA(rD)) );
+ name = "strh";
+ }
+ else if (bH == 1 && bL == 1 && bS == 0) { // uhalf load
+ putIRegA( rD, unop(Iop_16Uto32, loadLE(Ity_I16, mkexpr(taT))),
+ IRTemp_INVALID, Ijk_Boring );
+ name = "ldrh";
+ }
+ else if (bH == 1 && bL == 1 && bS == 1) { // shalf load
+ putIRegA( rD, unop(Iop_16Sto32, loadLE(Ity_I16, mkexpr(taT))),
+ IRTemp_INVALID, Ijk_Boring );
+ name = "ldrsh";
+ }
+ else if (bH == 0 && bL == 1 && bS == 1) { // sbyte load
+ putIRegA( rD, unop(Iop_8Sto32, loadLE(Ity_I8, mkexpr(taT))),
+ IRTemp_INVALID, Ijk_Boring );
+ name = "ldrsb";
+ }
+ else
+ vassert(0); // should be assured by logic above
+
+ /* Update Rn if necessary. */
+ switch (summary & 0x0F) {
+ case 2: case 3:
+ // should be assured by logic above:
+ if (bL == 1)
+ vassert(rD != rN); /* since we just wrote rD */
+ putIRegA( rN, mkexpr(eaT), IRTemp_INVALID, Ijk_Boring );
+ break;
+ }
+
+ switch (summary & 0x0F) {
+ case 1: DIP("%s%s r%u, %s\n", name, nCC(INSN_COND), rD, dis_buf);
+ break;
+ case 2: DIP("%s%s r%u, %s! (at-EA-then-Rn=EA)\n",
+ name, nCC(INSN_COND), rD, dis_buf);
+ break;
+ case 3: DIP("%s%s r%u, %s! (at-Rn-then-Rn=EA)\n",
+ name, nCC(INSN_COND), rD, dis_buf);
+ break;
+ default: vassert(0);
+ }
+
+ /* XXX deal with alignment constraints */
+
+ goto decode_success;
+
+ /* Complications:
+
+ For all loads: if the Amode specifies base register
+ writeback, and the same register is specified for Rd and Rn,
+ the results are UNPREDICTABLE.
+
+ For all loads and stores: if R15 is written, branch to
+ that address afterwards.
+
+ Misaligned halfword stores => Unpredictable
+ Misaligned halfword loads => Unpredictable
+ */
+ }
+
+ after_load_store_sbyte_or_hword:
+
+ /* --------------------- Load/store multiple -------------- */
+ // LD/STMIA LD/STMIB LD/STMDA LD/STMDB
+ // Remarkably complex and difficult to get right
+ // match 27:20 as 100XX0WL
+ if (BITS8(1,0,0,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,0,0,1,0,0))) {
+ // A5-50 LD/STMIA cond 1000 10WL Rn RegList
+ // A5-51 LD/STMIB cond 1001 10WL Rn RegList
+ // A5-53 LD/STMDA cond 1000 00WL Rn RegList
+ // A5-53 LD/STMDB cond 1001 00WL Rn RegList
+ // 28 24 20 16 0
+
+ UInt bINC = (insn >> 23) & 1;
+ UInt bBEFORE = (insn >> 24) & 1;
+
+ UInt bL = (insn >> 20) & 1; /* load=1, store=0 */
+ UInt bW = (insn >> 21) & 1; /* Rn wback=1, no wback=0 */
+ UInt rN = (insn >> 16) & 0xF;
+ UInt regList = insn & 0xFFFF;
+ /* Skip some invalid cases, which would lead to two competing
+ updates to the same register, or which are otherwise
+ disallowed by the spec. Note the test above has required
+ that S == 0, since that looks like a kernel-mode only thing.
+ Done by forcing the real pattern, viz 100XXSWL to actually be
+ 100XX0WL. */
+ if (rN == 15) goto after_load_store_multiple;
+ // reglist can't be empty
+ if (regList == 0) goto after_load_store_multiple;
+ // if requested to writeback Rn, and this is a load instruction,
+ // then Rn can't appear in RegList, since we'd have two competing
+ // new values for Rn. We do however accept this case for store
+ // instructions.
+ if (bW == 1 && bL == 1 && ((1 << rN) & regList) > 0)
+ goto after_load_store_multiple;
+
+ /* Now, we can't do a conditional load or store, since that very
+ likely will generate an exception. So we have to take a side
+ exit at this point if the condition is false. */
+ if (condT != IRTemp_INVALID) {
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+
+ /* Ok, now we're unconditional. Generate the IR. */
+ mk_ldm_stm( True/*arm*/, rN, bINC, bBEFORE, bW, bL, regList );
+
+ DIP("%sm%c%c%s r%u%s, {0x%04x}\n",
+ bL == 1 ? "ld" : "st", bINC ? 'i' : 'd', bBEFORE ? 'b' : 'a',
+ nCC(INSN_COND),
+ rN, bW ? "!" : "", regList);
+
+ goto decode_success;
+ }
+
+ after_load_store_multiple:
+
+ /* --------------------- Control flow --------------------- */
+ // B, BL (Branch, or Branch-and-Link, to immediate offset)
+ //
+ if (BITS8(1,0,1,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,0,0,0,0,0))) {
+ UInt link = (insn >> 24) & 1;
+ UInt uimm24 = insn & ((1<<24)-1);
+ Int simm24 = (Int)uimm24;
+ UInt dst = guest_R15_curr_instr_notENC + 8
+ + (((simm24 << 8) >> 8) << 2);
+ IRJumpKind jk = link ? Ijk_Call : Ijk_Boring;
+ if (link) {
+ putIRegA(14, mkU32(guest_R15_curr_instr_notENC + 4),
+ condT, Ijk_Boring);
+ }
+ if (condT == IRTemp_INVALID) {
+ /* unconditional transfer to 'dst'. See if we can simply
+ continue tracing at the destination. */
+ if (resteerOkFn( callback_opaque, (Addr64)dst )) {
+ /* yes */
+ dres.whatNext = Dis_ResteerU;
+ dres.continueAt = (Addr64)dst;
+ } else {
+ /* no; terminate the SB at this point. */
+ irsb->next = mkU32(dst);
+ irsb->jumpkind = jk;
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("b%s 0x%x\n", link ? "l" : "", dst);
+ } else {
+ /* conditional transfer to 'dst' */
+ HChar* comment = "";
+
+ /* First see if we can do some speculative chasing into one
+ arm or the other. Be conservative and only chase if
+ !link, that is, this is a normal conditional branch to a
+ known destination. */
+ if (!link
+ && resteerCisOk
+ && vex_control.guest_chase_cond
+ && dst < guest_R15_curr_instr_notENC
+ && resteerOkFn( callback_opaque, (Addr64)(Addr32)dst) ) {
+ /* Speculation: assume this backward branch is taken. So
+ we need to emit a side-exit to the insn following this
+ one, on the negation of the condition, and continue at
+ the branch target address (dst). */
+ stmt( IRStmt_Exit( unop(Iop_Not1,
+ unop(Iop_32to1, mkexpr(condT))),
+ Ijk_Boring,
+ IRConst_U32(guest_R15_curr_instr_notENC+4) ));
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = (Addr64)(Addr32)dst;
+ comment = "(assumed taken)";
+ }
+ else
+ if (!link
+ && resteerCisOk
+ && vex_control.guest_chase_cond
+ && dst >= guest_R15_curr_instr_notENC
+ && resteerOkFn( callback_opaque,
+ (Addr64)(Addr32)
+ (guest_R15_curr_instr_notENC+4)) ) {
+ /* Speculation: assume this forward branch is not taken.
+ So we need to emit a side-exit to dst (the dest) and
+ continue disassembling at the insn immediately
+ following this one. */
+ stmt( IRStmt_Exit( unop(Iop_32to1, mkexpr(condT)),
+ Ijk_Boring,
+ IRConst_U32(dst) ));
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = (Addr64)(Addr32)
+ (guest_R15_curr_instr_notENC+4);
+ comment = "(assumed not taken)";
+ }
+ else {
+ /* Conservative default translation - end the block at
+ this point. */
+ stmt( IRStmt_Exit( unop(Iop_32to1, mkexpr(condT)),
+ jk, IRConst_U32(dst) ));
+ irsb->next = mkU32(guest_R15_curr_instr_notENC + 4);
+ irsb->jumpkind = jk;
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("b%s%s 0x%x %s\n", link ? "l" : "", nCC(INSN_COND),
+ dst, comment);
+ }
+ goto decode_success;
+ }
+
+ // B, BL (Branch, or Branch-and-Link, to a register)
+ // NB: interworking branch
+ if (INSN(27,20) == BITS8(0,0,0,1,0,0,1,0)
+ && INSN(19,12) == BITS8(1,1,1,1,1,1,1,1)
+ && (INSN(11,4) == BITS8(1,1,1,1,0,0,1,1)
+ || INSN(11,4) == BITS8(1,1,1,1,0,0,0,1))) {
+ IRExpr* dst;
+ UInt link = (INSN(11,4) >> 1) & 1;
+ UInt rM = INSN(3,0);
+ // we don't decode the case (link && rM == 15), as that's
+ // Unpredictable.
+ if (!(link && rM == 15)) {
+ if (condT != IRTemp_INVALID) {
+ mk_skip_over_A32_if_cond_is_false( condT );
+ }
+ // rM contains an interworking address exactly as we require
+ // (with continuation CPSR.T in bit 0), so we can use it
+ // as-is, with no masking.
+ dst = getIRegA(rM);
+ if (link) {
+ putIRegA( 14, mkU32(guest_R15_curr_instr_notENC + 4),
+ IRTemp_INVALID/*because AL*/, Ijk_Boring );
+ }
+ irsb->next = dst;
+ irsb->jumpkind = link ? Ijk_Call
+ : (rM == 14 ? Ijk_Ret : Ijk_Boring);
+ dres.whatNext = Dis_StopHere;
+ if (condT == IRTemp_INVALID) {
+ DIP("b%sx r%u\n", link ? "l" : "", rM);
+ } else {
+ DIP("b%sx%s r%u\n", link ? "l" : "", nCC(INSN_COND), rM);
+ }
+ goto decode_success;
+ }
+ /* else: (link && rM == 15): just fall through */
+ }
+
+ /* --- NB: ARM interworking branches are in NV space, hence
+ are handled elsewhere by decode_NV_instruction.
+ ---
+ */
+
+ /* --------------------- Clz --------------------- */
+ // CLZ
+ if (INSN(27,20) == BITS8(0,0,0,1,0,1,1,0)
+ && INSN(19,16) == BITS4(1,1,1,1)
+ && INSN(11,4) == BITS8(1,1,1,1,0,0,0,1)) {
+ UInt rD = INSN(15,12);
+ UInt rM = INSN(3,0);
+ IRTemp arg = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign(arg, getIRegA(rM));
+ assign(res, IRExpr_Mux0X(
+ unop(Iop_1Uto8,binop(Iop_CmpEQ32, mkexpr(arg),
+ mkU32(0))),
+ unop(Iop_Clz32, mkexpr(arg)),
+ mkU32(32)
+ ));
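+ /* IRExpr_Mux0X yields its second arg when the condition byte
+ is zero and its third arg otherwise. So this produces
+ Clz32(arg) for nonzero inputs and 32 when arg == 0, matching
+ the ARM definition of CLZ and sidestepping Iop_Clz32, whose
+ result is undefined for a zero input. */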
+ putIRegA(rD, mkexpr(res), condT, Ijk_Boring);
+ DIP("clz%s r%u, r%u\n", nCC(INSN_COND), rD, rM);
+ goto decode_success;
+ }
+
+ /* --------------------- Mul etc --------------------- */
+ // MUL
+ if (BITS8(0,0,0,0,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,1,1,1,0))
+ && INSN(15,12) == BITS4(0,0,0,0)
+ && INSN(7,4) == BITS4(1,0,0,1)) {
+ UInt bitS = (insn >> 20) & 1; /* 20:20 */
+ UInt rD = INSN(19,16);
+ UInt rS = INSN(11,8);
+ UInt rM = INSN(3,0);
+ if (rD == 15 || rM == 15 || rS == 15) {
+ /* Unpredictable; don't decode; fall through */
+ } else {
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp oldC = IRTemp_INVALID;
+ IRTemp oldV = IRTemp_INVALID;
+ assign( argL, getIRegA(rM));
+ assign( argR, getIRegA(rS));
+ assign( res, binop(Iop_Mul32, mkexpr(argL), mkexpr(argR)) );
+ if (bitS) {
+ oldC = newTemp(Ity_I32);
+ assign(oldC, mk_armg_calculate_flag_c());
+ oldV = newTemp(Ity_I32);
+ assign(oldV, mk_armg_calculate_flag_v());
+ }
+ // now update guest state
+ putIRegA( rD, mkexpr(res), condT, Ijk_Boring );
+ if (bitS) {
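+ /* MUL leaves C and V unchanged, so pack the old values
+ (C in bit 1, V in bit 0) and pass them through unchanged
+ via setFlags_D1_ND as the thunk's not-dependent arg. */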
+ IRTemp pair = newTemp(Ity_I32);
+ assign( pair, binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(oldC), mkU8(1)),
+ mkexpr(oldV)) );
+ setFlags_D1_ND( ARMG_CC_OP_MUL, res, pair, condT );
+ }
+ DIP("mul%c%s r%u, r%u, r%u\n",
+ bitS ? 's' : ' ', nCC(INSN_COND), rD, rM, rS);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ // MLA, MLS
+ if (BITS8(0,0,0,0,0,0,1,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,0))
+ && INSN(7,4) == BITS4(1,0,0,1)) {
+ UInt bitS = (insn >> 20) & 1; /* 20:20 */
+ UInt isMLS = (insn >> 22) & 1; /* 22:22 */
+ UInt rD = INSN(19,16);
+ UInt rN = INSN(15,12);
+ UInt rS = INSN(11,8);
+ UInt rM = INSN(3,0);
+ if (bitS == 1 && isMLS == 1) {
+ /* This isn't allowed (MLS that sets flags). Don't decode;
+ fall through. */
+ }
+ else
+ if (rD == 15 || rM == 15 || rS == 15 || rN == 15) {
+ /* Unpredictable; don't decode; fall through */
+ } else {
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp argP = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp oldC = IRTemp_INVALID;
+ IRTemp oldV = IRTemp_INVALID;
+ assign( argL, getIRegA(rM));
+ assign( argR, getIRegA(rS));
+ assign( argP, getIRegA(rN));
+ assign( res, binop(isMLS ? Iop_Sub32 : Iop_Add32,
+ mkexpr(argP),
+ binop(Iop_Mul32, mkexpr(argL), mkexpr(argR)) ));
+ if (bitS) {
+ vassert(!isMLS); // guaranteed above
+ oldC = newTemp(Ity_I32);
+ assign(oldC, mk_armg_calculate_flag_c());
+ oldV = newTemp(Ity_I32);
+ assign(oldV, mk_armg_calculate_flag_v());
+ }
+ // now update guest state
+ putIRegA( rD, mkexpr(res), condT, Ijk_Boring );
+ if (bitS) {
+ IRTemp pair = newTemp(Ity_I32);
+ assign( pair, binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(oldC), mkU8(1)),
+ mkexpr(oldV)) );
+ setFlags_D1_ND( ARMG_CC_OP_MUL, res, pair, condT );
+ }
+ DIP("ml%c%c%s r%u, r%u, r%u, r%u\n",
+ isMLS ? 's' : 'a', bitS ? 's' : ' ',
+ nCC(INSN_COND), rD, rM, rS, rN);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ // SMULL, UMULL
+ if (BITS8(0,0,0,0,1,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,0))
+ && INSN(7,4) == BITS4(1,0,0,1)) {
+ UInt bitS = (insn >> 20) & 1; /* 20:20 */
+ UInt rDhi = INSN(19,16);
+ UInt rDlo = INSN(15,12);
+ UInt rS = INSN(11,8);
+ UInt rM = INSN(3,0);
+ UInt isS = (INSN(27,20) >> 2) & 1; /* 22:22 */
+ if (rDhi == 15 || rDlo == 15 || rM == 15 || rS == 15 || rDhi == rDlo) {
+ /* Unpredictable; don't decode; fall through */
+ } else {
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I64);
+ IRTemp resHi = newTemp(Ity_I32);
+ IRTemp resLo = newTemp(Ity_I32);
+ IRTemp oldC = IRTemp_INVALID;
+ IRTemp oldV = IRTemp_INVALID;
+ IROp mulOp = isS ? Iop_MullS32 : Iop_MullU32;
+ assign( argL, getIRegA(rM));
+ assign( argR, getIRegA(rS));
+ assign( res, binop(mulOp, mkexpr(argL), mkexpr(argR)) );
+ assign( resHi, unop(Iop_64HIto32, mkexpr(res)) );
+ assign( resLo, unop(Iop_64to32, mkexpr(res)) );
+ if (bitS) {
+ oldC = newTemp(Ity_I32);
+ assign(oldC, mk_armg_calculate_flag_c());
+ oldV = newTemp(Ity_I32);
+ assign(oldV, mk_armg_calculate_flag_v());
+ }
+ // now update guest state
+ putIRegA( rDhi, mkexpr(resHi), condT, Ijk_Boring );
+ putIRegA( rDlo, mkexpr(resLo), condT, Ijk_Boring );
+ if (bitS) {
+ IRTemp pair = newTemp(Ity_I32);
+ assign( pair, binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(oldC), mkU8(1)),
+ mkexpr(oldV)) );
+ setFlags_D1_D2_ND( ARMG_CC_OP_MULL, resLo, resHi, pair, condT );
+ }
+ DIP("%cmull%c%s r%u, r%u, r%u, r%u\n",
+ isS ? 's' : 'u', bitS ? 's' : ' ',
+ nCC(INSN_COND), rDlo, rDhi, rM, rS);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ // SMLAL, UMLAL
+ if (BITS8(0,0,0,0,1,0,1,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,0))
+ && INSN(7,4) == BITS4(1,0,0,1)) {
+ UInt bitS = (insn >> 20) & 1; /* 20:20 */
+ UInt rDhi = INSN(19,16);
+ UInt rDlo = INSN(15,12);
+ UInt rS = INSN(11,8);
+ UInt rM = INSN(3,0);
+ UInt isS = (INSN(27,20) >> 2) & 1; /* 22:22 */
+ if (rDhi == 15 || rDlo == 15 || rM == 15 || rS == 15 || rDhi == rDlo) {
+ /* Unpredictable; don't decode; fall through */
+ } else {
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp old = newTemp(Ity_I64);
+ IRTemp res = newTemp(Ity_I64);
+ IRTemp resHi = newTemp(Ity_I32);
+ IRTemp resLo = newTemp(Ity_I32);
+ IRTemp oldC = IRTemp_INVALID;
+ IRTemp oldV = IRTemp_INVALID;
+ IROp mulOp = isS ? Iop_MullS32 : Iop_MullU32;
+ assign( argL, getIRegA(rM));
+ assign( argR, getIRegA(rS));
+ assign( old, binop(Iop_32HLto64, getIRegA(rDhi), getIRegA(rDlo)) );
+ assign( res, binop(Iop_Add64,
+ mkexpr(old),
+ binop(mulOp, mkexpr(argL), mkexpr(argR))) );
+ assign( resHi, unop(Iop_64HIto32, mkexpr(res)) );
+ assign( resLo, unop(Iop_64to32, mkexpr(res)) );
+ if (bitS) {
+ oldC = newTemp(Ity_I32);
+ assign(oldC, mk_armg_calculate_flag_c());
+ oldV = newTemp(Ity_I32);
+ assign(oldV, mk_armg_calculate_flag_v());
+ }
+ // now update guest state
+ putIRegA( rDhi, mkexpr(resHi), condT, Ijk_Boring );
+ putIRegA( rDlo, mkexpr(resLo), condT, Ijk_Boring );
+ if (bitS) {
+ IRTemp pair = newTemp(Ity_I32);
+ assign( pair, binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(oldC), mkU8(1)),
+ mkexpr(oldV)) );
+ setFlags_D1_D2_ND( ARMG_CC_OP_MULL, resLo, resHi, pair, condT );
+ }
+ DIP("%cmlal%c%s r%u, r%u, r%u, r%u\n",
+ isS ? 's' : 'u', bitS ? 's' : ' ', nCC(INSN_COND),
+ rDlo, rDhi, rM, rS);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* --------------------- Msr etc --------------------- */
+
+ // MSR apsr, #imm
+ if (INSN(27,20) == BITS8(0,0,1,1,0,0,1,0)
+ && INSN(17,12) == BITS6(0,0,1,1,1,1)) {
+ UInt write_ge = INSN(18,18);
+ UInt write_nzcvq = INSN(19,19);
+ if (write_nzcvq || write_ge) {
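+ /* Standard ARM immediate: an 8-bit value rotated right by
+ twice the 4-bit rotate field, so rot is in {0, 2, .., 30}. */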
+ UInt imm = (INSN(11,0) >> 0) & 0xFF;
+ UInt rot = 2 * ((INSN(11,0) >> 8) & 0xF);
+ IRTemp immT = newTemp(Ity_I32);
+ vassert(rot <= 30);
+ imm = ROR32(imm, rot);
+ assign(immT, mkU32(imm));
+ desynthesise_APSR( write_nzcvq, write_ge, immT, condT );
+ DIP("msr%s cpsr%s%sf, #0x%08x\n", nCC(INSN_COND),
+ write_nzcvq ? "f" : "", write_ge ? "g" : "", imm);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ // MSR apsr, reg
+ if (INSN(27,20) == BITS8(0,0,0,1,0,0,1,0)
+ && INSN(17,12) == BITS6(0,0,1,1,1,1)
+ && INSN(11,4) == BITS8(0,0,0,0,0,0,0,0)) {
+ UInt rN = INSN(3,0);
+ UInt write_ge = INSN(18,18);
+ UInt write_nzcvq = INSN(19,19);
+ if (rN != 15 && (write_nzcvq || write_ge)) {
+ IRTemp rNt = newTemp(Ity_I32);
+ assign(rNt, getIRegA(rN));
+ desynthesise_APSR( write_nzcvq, write_ge, rNt, condT );
+ DIP("msr%s cpsr_%s%s, r%u\n", nCC(INSN_COND),
+ write_nzcvq ? "f" : "", write_ge ? "g" : "", rN);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ // MRS rD, cpsr
+ if ((insn & 0x0FFF0FFF) == 0x010F0000) {
+ UInt rD = INSN(15,12);
+ if (rD != 15) {
+ IRTemp apsr = synthesise_APSR();
+ putIRegA( rD, mkexpr(apsr), condT, Ijk_Boring );
+ DIP("mrs%s r%u, cpsr\n", nCC(INSN_COND), rD);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* --------------------- Svc --------------------- */
+ if (BITS8(1,1,1,1,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,0,0,0,0))) {
+ UInt imm24 = (insn >> 0) & 0xFFFFFF;
+ if (imm24 == 0) {
+ /* A syscall. We can't do this conditionally, hence: */
+ if (condT != IRTemp_INVALID) {
+ mk_skip_over_A32_if_cond_is_false( condT );
+ }
+ // AL after here
+ irsb->next = mkU32( guest_R15_curr_instr_notENC + 4 );
+ irsb->jumpkind = Ijk_Sys_syscall;
+ dres.whatNext = Dis_StopHere;
+ DIP("svc%s #0x%08x\n", nCC(INSN_COND), imm24);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* ------------------------ swp ------------------------ */
+
+ // SWP, SWPB
+ if (BITS8(0,0,0,1,0,0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,0,0,0) == INSN(11,8)
+ && BITS4(1,0,0,1) == INSN(7,4)) {
+ UInt rN = INSN(19,16);
+ UInt rD = INSN(15,12);
+ UInt rM = INSN(3,0);
+ IRTemp tRn = newTemp(Ity_I32);
+ IRTemp tNew = newTemp(Ity_I32);
+ IRTemp tOld = IRTemp_INVALID;
+ IRTemp tSC1 = newTemp(Ity_I1);
+ UInt isB = (insn >> 22) & 1;
+
+ if (rD == 15 || rN == 15 || rM == 15 || rN == rM || rN == rD) {
+ /* undecodable; fall through */
+ } else {
+ /* make unconditional */
+ if (condT != IRTemp_INVALID) {
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+ /* Ok, now we're unconditional. Generate a LL-SC loop. */
+ assign(tRn, getIRegA(rN));
+ assign(tNew, getIRegA(rM));
+ if (isB) {
+ /* swpb */
+ tOld = newTemp(Ity_I8);
+ stmt( IRStmt_LLSC(Iend_LE, tOld, mkexpr(tRn),
+ NULL/*=>isLL*/) );
+ stmt( IRStmt_LLSC(Iend_LE, tSC1, mkexpr(tRn),
+ unop(Iop_32to8, mkexpr(tNew))) );
+ } else {
+ /* swp */
+ tOld = newTemp(Ity_I32);
+ stmt( IRStmt_LLSC(Iend_LE, tOld, mkexpr(tRn),
+ NULL/*=>isLL*/) );
+ stmt( IRStmt_LLSC(Iend_LE, tSC1, mkexpr(tRn),
+ mkexpr(tNew)) );
+ }
+ stmt( IRStmt_Exit(unop(Iop_Not1, mkexpr(tSC1)),
+ /*Ijk_NoRedir*/Ijk_Boring,
+ IRConst_U32(guest_R15_curr_instr_notENC)) );
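+ /* If the store-conditional failed, loop back to this same
+ instruction and retry the whole LL/SC pair; on success fall
+ through with the old memory value in tOld. */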
+ putIRegA(rD, isB ? unop(Iop_8Uto32, mkexpr(tOld)) : mkexpr(tOld),
+ IRTemp_INVALID, Ijk_Boring);
+ DIP("swp%s%s r%u, r%u, [r%u]\n",
+ isB ? "b" : "", nCC(INSN_COND), rD, rM, rN);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- ARMv6 instructions -- */
+ /* ----------------------------------------------------------- */
+
+ /* --------------------- ldrex, strex --------------------- */
+
+ // LDREX
+ if (0x01900F9F == (insn & 0x0FF00FFF)) {
+ UInt rT = INSN(15,12);
+ UInt rN = INSN(19,16);
+ if (rT == 15 || rN == 15) {
+ /* undecodable; fall through */
+ } else {
+ IRTemp res;
+ /* make unconditional */
+ if (condT != IRTemp_INVALID) {
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+ /* Ok, now we're unconditional. Do the load. */
+ res = newTemp(Ity_I32);
+ stmt( IRStmt_LLSC(Iend_LE, res, getIRegA(rN),
+ NULL/*this is a load*/) );
+ putIRegA(rT, mkexpr(res), IRTemp_INVALID, Ijk_Boring);
+ DIP("ldrex%s r%u, [r%u]\n", nCC(INSN_COND), rT, rN);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ // STREX
+ if (0x01800F90 == (insn & 0x0FF00FF0)) {
+ UInt rT = INSN(3,0);
+ UInt rN = INSN(19,16);
+ UInt rD = INSN(15,12);
+ if (rT == 15 || rN == 15 || rD == 15
+ || rD == rT || rD == rN) {
+ /* undecodable; fall through */
+ } else {
+ IRTemp resSC1, resSC32;
+
+ /* make unconditional */
+ if (condT != IRTemp_INVALID) {
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+
+ /* Ok, now we're unconditional. Do the store. */
+ resSC1 = newTemp(Ity_I1);
+ stmt( IRStmt_LLSC(Iend_LE, resSC1, getIRegA(rN), getIRegA(rT)) );
+
+ /* Set rD to 1 on failure, 0 on success. Currently we have
+ resSC1 == 0 on failure, 1 on success. */
+ resSC32 = newTemp(Ity_I32);
+ assign(resSC32,
+ unop(Iop_1Uto32, unop(Iop_Not1, mkexpr(resSC1))));
+
+ putIRegA(rD, mkexpr(resSC32),
+ IRTemp_INVALID, Ijk_Boring);
+ DIP("strex%s r%u, r%u, [r%u]\n", nCC(INSN_COND), rD, rT, rN);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* --------------------- movw, movt --------------------- */
+ if (0x03000000 == (insn & 0x0FF00000)
+ || 0x03400000 == (insn & 0x0FF00000)) /* pray for CSE */ {
+ UInt rD = INSN(15,12);
+ UInt imm16 = (insn & 0xFFF) | ((insn >> 4) & 0x0000F000);
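+ /* imm16 = imm4 (insn 19:16) : imm12 (insn 11:0) */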
+ UInt isT = (insn >> 22) & 1;
+ if (rD == 15) {
+ /* forget it */
+ } else {
+ if (isT) {
+ putIRegA(rD,
+ binop(Iop_Or32,
+ binop(Iop_And32, getIRegA(rD), mkU32(0xFFFF)),
+ mkU32(imm16 << 16)),
+ condT, Ijk_Boring);
+ DIP("movt%s r%u, #0x%04x\n", nCC(INSN_COND), rD, imm16);
+ goto decode_success;
+ } else {
+ putIRegA(rD, mkU32(imm16), condT, Ijk_Boring);
+ DIP("movw%s r%u, #0x%04x\n", nCC(INSN_COND), rD, imm16);
+ goto decode_success;
+ }
+ }
+ /* fall through */
+ }
+
+ /* ----------- uxtb, sxtb, uxth, sxth, uxtb16, sxtb16 ----------- */
+ /* FIXME: this is an exact duplicate of the Thumb version. They
+ should be commoned up. */
+ if (BITS8(0,1,1,0,1, 0,0,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,0,0))
+ && BITS4(1,1,1,1) == INSN(19,16)
+ && BITS4(0,1,1,1) == INSN(7,4)
+ && BITS4(0,0, 0,0) == (INSN(11,8) & BITS4(0,0,1,1))) {
+ UInt subopc = INSN(27,20) & BITS8(0,0,0,0,0, 1,1,1);
+ if (subopc != BITS4(0,0,0,1) && subopc != BITS4(0,1,0,1)) {
+ Int rot = (INSN(11,8) >> 2) & 3;
+ UInt rM = INSN(3,0);
+ UInt rD = INSN(15,12);
+ IRTemp srcT = newTemp(Ity_I32);
+ IRTemp rotT = newTemp(Ity_I32);
+ IRTemp dstT = newTemp(Ity_I32);
+ HChar* nm = "???";
+ assign(srcT, getIRegA(rM));
+ assign(rotT, genROR32(srcT, 8 * rot)); /* 0, 8, 16 or 24 only */
+ switch (subopc) {
+ case BITS4(0,1,1,0): // UXTB
+ assign(dstT, unop(Iop_8Uto32, unop(Iop_32to8, mkexpr(rotT))));
+ nm = "uxtb";
+ break;
+ case BITS4(0,0,1,0): // SXTB
+ assign(dstT, unop(Iop_8Sto32, unop(Iop_32to8, mkexpr(rotT))));
+ nm = "sxtb";
+ break;
+ case BITS4(0,1,1,1): // UXTH
+ assign(dstT, unop(Iop_16Uto32, unop(Iop_32to16, mkexpr(rotT))));
+ nm = "uxth";
+ break;
+ case BITS4(0,0,1,1): // SXTH
+ assign(dstT, unop(Iop_16Sto32, unop(Iop_32to16, mkexpr(rotT))));
+ nm = "sxth";
+ break;
+ case BITS4(0,1,0,0): // UXTB16
+ assign(dstT, binop(Iop_And32, mkexpr(rotT), mkU32(0x00FF00FF)));
+ nm = "uxtb16";
+ break;
+ case BITS4(0,0,0,0): { // SXTB16
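+ /* Sign-extend byte 0 into halfword 0 and byte 2 into
+ halfword 1, discarding bytes 1 and 3 of the source. */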
+ IRTemp lo32 = newTemp(Ity_I32);
+ IRTemp hi32 = newTemp(Ity_I32);
+ assign(lo32, binop(Iop_And32, mkexpr(rotT), mkU32(0xFF)));
+ assign(hi32, binop(Iop_Shr32, mkexpr(rotT), mkU8(16)));
+ assign(
+ dstT,
+ binop(Iop_Or32,
+ binop(Iop_And32,
+ unop(Iop_8Sto32,
+ unop(Iop_32to8, mkexpr(lo32))),
+ mkU32(0xFFFF)),
+ binop(Iop_Shl32,
+ unop(Iop_8Sto32,
+ unop(Iop_32to8, mkexpr(hi32))),
+ mkU8(16))
+ ));
+ nm = "sxtb16";
+ break;
+ }
+ default:
+ vassert(0); // guarded by "if" above
+ }
+ putIRegA(rD, mkexpr(dstT), condT, Ijk_Boring);
+ DIP("%s%s r%u, r%u, ROR #%u\n", nm, nCC(INSN_COND), rD, rM, rot);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- bfi, bfc ------------------- */
+ if (BITS8(0,1,1,1,1,1,0, 0) == (INSN(27,20) & BITS8(1,1,1,1,1,1,1,0))
+ && BITS4(0, 0,0,1) == (INSN(7,4) & BITS4(0,1,1,1))) {
+ UInt rD = INSN(15,12);
+ UInt rN = INSN(3,0);
+ UInt msb = (insn >> 16) & 0x1F; /* 20:16 */
+ UInt lsb = (insn >> 7) & 0x1F; /* 11:7 */
+ if (rD == 15 || msb < lsb) {
+ /* undecodable; fall through */
+ } else {
+ IRTemp src = newTemp(Ity_I32);
+ IRTemp olddst = newTemp(Ity_I32);
+ IRTemp newdst = newTemp(Ity_I32);
+ UInt mask = 1 << (msb - lsb);
+ mask = (mask - 1) + mask;
+ vassert(mask != 0); // guaranteed by "msb < lsb" check above
+ mask <<= lsb;
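+ /* mask now covers bits msb:lsb. The two-step "(mask - 1) + mask"
+ form computes 2^(msb-lsb+1) - 1 without the 1 << 32 overflow
+ that a 32-bit-wide field would otherwise cause. E.g. msb = 7,
+ lsb = 4: 1 << 3 = 8, 7 + 8 = 0xF, 0xF << 4 = 0xF0. */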
+
+ assign(src, rN == 15 ? mkU32(0) : getIRegA(rN));
+ assign(olddst, getIRegA(rD));
+ assign(newdst,
+ binop(Iop_Or32,
+ binop(Iop_And32,
+ binop(Iop_Shl32, mkexpr(src), mkU8(lsb)),
+ mkU32(mask)),
+ binop(Iop_And32,
+ mkexpr(olddst),
+ mkU32(~mask)))
+ );
+
+ putIRegA(rD, mkexpr(newdst), condT, Ijk_Boring);
+
+ if (rN == 15) {
+ DIP("bfc%s r%u, #%u, #%u\n",
+ nCC(INSN_COND), rD, lsb, msb-lsb+1);
+ } else {
+ DIP("bfi%s r%u, r%u, #%u, #%u\n",
+ nCC(INSN_COND), rD, rN, lsb, msb-lsb+1);
+ }
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- {u,s}bfx ------------------- */
+ if (BITS8(0,1,1,1,1,0,1,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,0))
+ && BITS4(0,1,0,1) == (INSN(7,4) & BITS4(0,1,1,1))) {
+ UInt rD = INSN(15,12);
+ UInt rN = INSN(3,0);
+ UInt wm1 = (insn >> 16) & 0x1F; /* 20:16 */
+ UInt lsb = (insn >> 7) & 0x1F; /* 11:7 */
+ UInt msb = lsb + wm1;
+ UInt isU = (insn >> 22) & 1; /* 22:22 */
+ if (rD == 15 || rN == 15 || msb >= 32) {
+ /* undecodable; fall through */
+ } else {
+ IRTemp src = newTemp(Ity_I32);
+ IRTemp tmp = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ UInt mask = ((1 << wm1) - 1) + (1 << wm1);
+ vassert(msb >= 0 && msb <= 31);
+ vassert(mask != 0); // guaranteed by msb being in 0 .. 31 inclusive
+
+ assign(src, getIRegA(rN));
+ assign(tmp, binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(src), mkU8(lsb)),
+ mkU32(mask)));
+ assign(res, binop(isU ? Iop_Shr32 : Iop_Sar32,
+ binop(Iop_Shl32, mkexpr(tmp), mkU8(31-wm1)),
+ mkU8(31-wm1)));
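+ /* Shifting left by 31-wm1 parks the field's top bit at bit 31;
+ shifting back down by the same amount then zero- (Shr32) or
+ sign- (Sar32) extends it. E.g. lsb = 4, wm1 = 3: the 4-bit
+ field moves to bits 31:28, and Sar32 replicates bit 31
+ downwards. */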
+
+ putIRegA(rD, mkexpr(res), condT, Ijk_Boring);
+
+ DIP("%s%s r%u, r%u, #%u, #%u\n",
+ isU ? "ubfx" : "sbfx",
+ nCC(INSN_COND), rD, rN, lsb, wm1 + 1);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* --------------------- Load/store doubleword ------------- */
+ // LDRD STRD
+ /* 31 27 23 19 15 11 7 3 # highest bit
+ 28 24 20 16 12 8 4 0
+ A5-36 1 | 16 cond 0001 U100 Rn Rd im4h 11S1 im4l
+ A5-38 1 | 32 cond 0001 U000 Rn Rd 0000 11S1 Rm
+ A5-40 2 | 16 cond 0001 U110 Rn Rd im4h 11S1 im4l
+ A5-42 2 | 32 cond 0001 U010 Rn Rd 0000 11S1 Rm
+ A5-44 3 | 16 cond 0000 U100 Rn Rd im4h 11S1 im4l
+ A5-46 3 | 32 cond 0000 U000 Rn Rd 0000 11S1 Rm
+ */
+ /* case coding:
+ 1 at-ea (access at ea)
+ 2 at-ea-then-upd (access at ea, then Rn = ea)
+ 3 at-Rn-then-upd (access at Rn, then Rn = ea)
+ ea coding
+ 16 Rn +/- imm8
+ 32 Rn +/- Rm
+ */
+ /* Quickly skip over all of this for hopefully most instructions */
+ if ((INSN(27,24) & BITS4(1,1,1,0)) != BITS4(0,0,0,0))
+ goto after_load_store_doubleword;
+
+ /* Check the "11S1" thing. */
+ if ((INSN(7,4) & BITS4(1,1,0,1)) != BITS4(1,1,0,1))
+ goto after_load_store_doubleword;
+
+ summary = 0;
+
+ /**/ if (INSN(27,24) == BITS4(0,0,0,1) && INSN(22,20) == BITS3(1,0,0)) {
+ summary = 1 | 16;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,1) && INSN(22,20) == BITS3(0,0,0)) {
+ summary = 1 | 32;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,1) && INSN(22,20) == BITS3(1,1,0)) {
+ summary = 2 | 16;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,1) && INSN(22,20) == BITS3(0,1,0)) {
+ summary = 2 | 32;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,0) && INSN(22,20) == BITS3(1,0,0)) {
+ summary = 3 | 16;
+ }
+ else if (INSN(27,24) == BITS4(0,0,0,0) && INSN(22,20) == BITS3(0,0,0)) {
+ summary = 3 | 32;
+ }
+ else goto after_load_store_doubleword;
+
+ { UInt rN = (insn >> 16) & 0xF; /* 19:16 */
+ UInt rD = (insn >> 12) & 0xF; /* 15:12 */
+ UInt rM = (insn >> 0) & 0xF; /* 3:0 */
+ UInt bU = (insn >> 23) & 1; /* 23 U=1 offset+, U=0 offset- */
+ UInt bS = (insn >> 5) & 1; /* S=1 store, S=0 load */
+ UInt imm8 = ((insn >> 4) & 0xF0) | (insn & 0xF); /* 11:8, 3:0 */
+
+ /* Require rD to be an even numbered register */
+ if ((rD & 1) != 0)
+ goto after_load_store_doubleword;
+
+ /* Require 11:8 == 0 for Rn +/- Rm cases */
+ if ((summary & 32) != 0 && (imm8 & 0xF0) != 0)
+ goto after_load_store_doubleword;
+
+ /* Skip some invalid cases, which would lead to two competing
+ updates to the same register, or which are otherwise
+ disallowed by the spec. */
+ switch (summary) {
+ case 1 | 16:
+ break;
+ case 1 | 32:
+ if (rM == 15) goto after_load_store_doubleword;
+ break;
+ case 2 | 16: case 3 | 16:
+ if (rN == 15) goto after_load_store_doubleword;
+ if (bS == 0 && (rN == rD || rN == rD+1))
+ goto after_load_store_doubleword;
+ break;
+ case 2 | 32: case 3 | 32:
+ if (rM == 15) goto after_load_store_doubleword;
+ if (rN == 15) goto after_load_store_doubleword;
+ if (rN == rM) goto after_load_store_doubleword;
+ if (bS == 0 && (rN == rD || rN == rD+1))
+ goto after_load_store_doubleword;
+ break;
+ default:
+ vassert(0);
+ }
+
+ /* Now, we can't do a conditional load or store, since that very
+ likely will generate an exception. So we have to take a side
+ exit at this point if the condition is false. */
+ if (condT != IRTemp_INVALID) {
+ mk_skip_over_A32_if_cond_is_false( condT );
+ condT = IRTemp_INVALID;
+ }
+ /* Ok, now we're unconditional. Do the load or store. */
+
+ /* compute the effective address. Bind it to a tmp since we
+ may need to use it twice. */
+ IRExpr* eaE = NULL;
+ switch (summary & 0xF0) {
+ case 16:
+ eaE = mk_EA_reg_plusminus_imm8( rN, bU, imm8, dis_buf );
+ break;
+ case 32:
+ eaE = mk_EA_reg_plusminus_reg( rN, bU, rM, dis_buf );
+ break;
+ }
+ vassert(eaE);
+ IRTemp eaT = newTemp(Ity_I32);
+ assign(eaT, eaE);
+
+ /* get the old Rn value */
+ IRTemp rnT = newTemp(Ity_I32);
+ assign(rnT, getIRegA(rN));
+
+ /* decide on the transfer address */
+ IRTemp taT = IRTemp_INVALID;
+ switch (summary & 0x0F) {
+ case 1: case 2: taT = eaT; break;
+ case 3: taT = rnT; break;
+ }
+ vassert(taT != IRTemp_INVALID);
+
+ /* XXX deal with alignment constraints */
+ /* XXX: but the A8 doesn't seem to trap for misaligned loads, so,
+ ignore alignment issues for the time being. */
+
+ /* doubleword store S 1
+ doubleword load S 0
+ */
+ HChar* name = NULL;
+ /* generate the transfers */
+ if (bS == 1) { // doubleword store
+ storeLE( binop(Iop_Add32, mkexpr(taT), mkU32(0)), getIRegA(rD+0) );
+ storeLE( binop(Iop_Add32, mkexpr(taT), mkU32(4)), getIRegA(rD+1) );
+ name = "strd";
+ } else { // doubleword load
+ putIRegA( rD+0,
+ loadLE(Ity_I32, binop(Iop_Add32, mkexpr(taT), mkU32(0))),
+ IRTemp_INVALID, Ijk_Boring );
+ putIRegA( rD+1,
+ loadLE(Ity_I32, binop(Iop_Add32, mkexpr(taT), mkU32(4))),
+ IRTemp_INVALID, Ijk_Boring );
+ name = "ldrd";
+ }
+
+ /* Update Rn if necessary. */
+ switch (summary & 0x0F) {
+ case 2: case 3:
+ // should be assured by logic above:
+ if (bS == 0) {
+ vassert(rD+0 != rN); /* since we just wrote rD+0 */
+ vassert(rD+1 != rN); /* since we just wrote rD+1 */
+ }
+ putIRegA( rN, mkexpr(eaT), IRTemp_INVALID, Ijk_Boring );
+ break;
+ }
+
+ switch (summary & 0x0F) {
+ case 1: DIP("%s%s r%u, %s\n", name, nCC(INSN_COND), rD, dis_buf);
+ break;
+ case 2: DIP("%s%s r%u, %s! (at-EA-then-Rn=EA)\n",
+ name, nCC(INSN_COND), rD, dis_buf);
+ break;
+ case 3: DIP("%s%s r%u, %s! (at-Rn-then-Rn=EA)\n",
+ name, nCC(INSN_COND), rD, dis_buf);
+ break;
+ default: vassert(0);
+ }
+
+ goto decode_success;
+ }
+
+ after_load_store_doubleword:
+
+ /* ------------------- {s,u}xtab ------------- */
+ if (BITS8(0,1,1,0,1,0,1,0) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,0,0,0) == (INSN(11,8) & BITS4(0,0,1,1))
+ && BITS4(0,1,1,1) == INSN(7,4)) {
+ UInt rN = INSN(19,16);
+ UInt rD = INSN(15,12);
+ UInt rM = INSN(3,0);
+ UInt rot = (insn >> 10) & 3;
+ UInt isU = INSN(22,22);
+ if (rN == 15/*it's {S,U}XTB*/ || rD == 15 || rM == 15) {
+ /* undecodable; fall through */
+ } else {
+ IRTemp srcL = newTemp(Ity_I32);
+ IRTemp srcR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign(srcR, getIRegA(rM));
+ assign(srcL, getIRegA(rN));
+ assign(res, binop(Iop_Add32,
+ mkexpr(srcL),
+ unop(isU ? Iop_8Uto32 : Iop_8Sto32,
+ unop(Iop_32to8,
+ genROR32(srcR, 8 * rot)))));
+ putIRegA(rD, mkexpr(res), condT, Ijk_Boring);
+ DIP("%cxtab%s r%u, r%u, r%u, ror #%u\n",
+ isU ? 'u' : 's', nCC(INSN_COND), rD, rN, rM, rot);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- {s,u}xtah ------------- */
+ if (BITS8(0,1,1,0,1,0,1,1) == (INSN(27,20) & BITS8(1,1,1,1,1,0,1,1))
+ && BITS4(0,0,0,0) == (INSN(11,8) & BITS4(0,0,1,1))
+ && BITS4(0,1,1,1) == INSN(7,4)) {
+ UInt rN = INSN(19,16);
+ UInt rD = INSN(15,12);
+ UInt rM = INSN(3,0);
+ UInt rot = (insn >> 10) & 3;
+ UInt isU = INSN(22,22);
+ if (rN == 15/*it's {S,U}XTH*/ || rD == 15 || rM == 15) {
+ /* undecodable; fall through */
+ } else {
+ IRTemp srcL = newTemp(Ity_I32);
+ IRTemp srcR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign(srcR, getIRegA(rM));
+ assign(srcL, getIRegA(rN));
+ assign(res, binop(Iop_Add32,
+ mkexpr(srcL),
+ unop(isU ? Iop_16Uto32 : Iop_16Sto32,
+ unop(Iop_32to16,
+ genROR32(srcR, 8 * rot)))));
+ putIRegA(rD, mkexpr(res), condT, Ijk_Boring);
+
+ DIP("%cxtah%s r%u, r%u, r%u, ror #%u\n",
+ isU ? 'u' : 's', nCC(INSN_COND), rD, rN, rM, rot);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* ------------------- rev16, rev ------------------ */
+ if (INSN(27,16) == 0x6BF
+ && (INSN(11,4) == 0xFB/*rev16*/ || INSN(11,4) == 0xF3/*rev*/)) {
+ Bool isREV = INSN(11,4) == 0xF3;
+ UInt rM = INSN(3,0);
+ UInt rD = INSN(15,12);
+ if (rM != 15 && rD != 15) {
+ IRTemp rMt = newTemp(Ity_I32);
+ assign(rMt, getIRegA(rM));
+ IRTemp res = isREV ? gen_REV(rMt) : gen_REV16(rMt);
+ putIRegA(rD, mkexpr(res), condT, Ijk_Boring);
+ DIP("rev%s%s r%u, r%u\n", isREV ? "" : "16",
+ nCC(INSN_COND), rD, rM);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------- rbit ------------------ */
+ if (INSN(27,16) == 0x6FF && INSN(11,4) == 0xF3) {
+ UInt rD = INSN(15,12);
+ UInt rM = INSN(3,0);
+ if (rD != 15 && rM != 15) {
+ IRTemp arg = newTemp(Ity_I32);
+ assign(arg, getIRegA(rM));
+ IRTemp res = gen_BITREV(arg);
+ putIRegA(rD, mkexpr(res), condT, Ijk_Boring);
+         DIP("rbit%s r%u, r%u\n", nCC(INSN_COND), rD, rM);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------- smmul ------------------ */
+ if (INSN(27,20) == BITS8(0,1,1,1,0,1,0,1)
+ && INSN(15,12) == BITS4(1,1,1,1)
+ && (INSN(7,4) & BITS4(1,1,0,1)) == BITS4(0,0,0,1)) {
+ UInt bitR = INSN(5,5);
+ UInt rD = INSN(19,16);
+ UInt rM = INSN(11,8);
+ UInt rN = INSN(3,0);
+ if (rD != 15 && rM != 15 && rN != 15) {
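+         /* For SMMULR (bitR == 1), adding 2^31 to the 64-bit product
+            before taking the top half rounds the result to nearest
+            rather than truncating it. */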
+ IRExpr* res
+ = unop(Iop_64HIto32,
+ binop(Iop_Add64,
+ binop(Iop_MullS32, getIRegA(rN), getIRegA(rM)),
+ mkU64(bitR ? 0x80000000ULL : 0ULL)));
+ putIRegA(rD, res, condT, Ijk_Boring);
+         DIP("smmul%s%s r%u, r%u, r%u\n",
+             bitR ? "r" : "", nCC(INSN_COND), rD, rN, rM);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------- NOP ------------------ */
+ if (0x0320F000 == (insn & 0x0FFFFFFF)) {
+ DIP("nop%s\n", nCC(INSN_COND));
+ goto decode_success;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- ARMv7 instructions -- */
+ /* ----------------------------------------------------------- */
+
+ /* -------------- read CP15 TPIDRURO register ------------- */
+ /* mrc p15, 0, r0, c13, c0, 3 up to
+ mrc p15, 0, r14, c13, c0, 3
+ */
+ /* I don't know whether this is really v7-only. But anyway, we
+ have to support it since arm-linux uses TPIDRURO as a thread
+ state register. */
+ if (0x0E1D0F70 == (insn & 0x0FFF0FFF)) {
+ UInt rD = INSN(15,12);
+ if (rD <= 14) {
+ /* skip r15, that's too stupid to handle */
+ putIRegA(rD, IRExpr_Get(OFFB_TPIDRURO, Ity_I32),
+ condT, Ijk_Boring);
+         DIP("mrc%s p15, 0, r%u, c13, c0, 3\n", nCC(INSN_COND), rD);
+ goto decode_success;
+ }
+ /* fall through */
+ }
+
+ /* Handle various kinds of barriers. This is rather indiscriminate
+ in the sense that they are all turned into an IR Fence, which
+ means we don't know which they are, so the back end has to
+      re-emit them all when it comes across an IR Fence.
+ */
+ switch (insn) {
+ case 0xEE070F9A: /* v6 */
+ /* mcr 15, 0, r0, c7, c10, 4 (v6) equiv to DSB (v7). Data
+ Synch Barrier -- ensures completion of memory accesses. */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("mcr 15, 0, r0, c7, c10, 4 (data synch barrier)\n");
+ goto decode_success;
+ case 0xEE070FBA: /* v6 */
+ /* mcr 15, 0, r0, c7, c10, 5 (v6) equiv to DMB (v7). Data
+ Memory Barrier -- ensures ordering of memory accesses. */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("mcr 15, 0, r0, c7, c10, 5 (data memory barrier)\n");
+ goto decode_success;
+ case 0xEE070F95: /* v6 */
+ /* mcr 15, 0, r0, c7, c5, 4 (v6) equiv to ISB (v7).
+ Instruction Synchronisation Barrier (or Flush Prefetch
+ Buffer) -- a pipe flush, I think. I suspect we could
+ ignore those, but to be on the safe side emit a fence
+ anyway. */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("mcr 15, 0, r0, c7, c5, 4 (insn synch barrier)\n");
+ goto decode_success;
+ default:
+ break;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- VFP (CP 10, CP 11) instructions (in ARM mode) -- */
+ /* ----------------------------------------------------------- */
+
+ if (INSN_COND != ARMCondNV) {
+ Bool ok_vfp = decode_CP10_CP11_instruction (
+ &dres, INSN(27,0), condT, INSN_COND,
+ False/*!isT*/
+ );
+ if (ok_vfp)
+ goto decode_success;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- NEON instructions (in ARM mode) -- */
+ /* ----------------------------------------------------------- */
+
+ /* These are all in NV space, and so are taken care of (far) above,
+ by a call from this function to decode_NV_instruction(). */
+
+ /* ----------------------------------------------------------- */
+ /* -- v6 media instructions (in ARM mode) -- */
+ /* ----------------------------------------------------------- */
+
+ { Bool ok_v6m = decode_V6MEDIA_instruction(
+ &dres, INSN(27,0), condT, INSN_COND,
+ False/*!isT*/
+ );
+ if (ok_v6m)
+ goto decode_success;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- Undecodable -- */
+ /* ----------------------------------------------------------- */
+
+ goto decode_failure;
+ /*NOTREACHED*/
+
+ decode_failure:
+ /* All decode failures end up here. */
+ vex_printf("disInstr(arm): unhandled instruction: "
+ "0x%x\n", insn);
+ vex_printf(" cond=%d(0x%x) 27:20=%u(0x%02x) "
+ "4:4=%d "
+ "3:0=%u(0x%x)\n",
+ (Int)INSN_COND, (UInt)INSN_COND,
+ (Int)INSN(27,20), (UInt)INSN(27,20),
+ (Int)INSN(4,4),
+ (Int)INSN(3,0), (UInt)INSN(3,0) );
+
+ /* Tell the dispatcher that this insn cannot be decoded, and so has
+ not been executed, and (is currently) the next to be executed.
+      R15 should be up-to-date since it was made so at the start of each
+ insn, but nevertheless be paranoid and update it again right
+ now. */
+ vassert(0 == (guest_R15_curr_instr_notENC & 3));
+ llPutIReg( 15, mkU32(guest_R15_curr_instr_notENC) );
+ irsb->next = mkU32(guest_R15_curr_instr_notENC);
+ irsb->jumpkind = Ijk_NoDecode;
+ dres.whatNext = Dis_StopHere;
+ dres.len = 0;
+ return dres;
+
+ decode_success:
+ /* All decode successes end up here. */
+ DIP("\n");
+
+ vassert(dres.len == 4 || dres.len == 20);
+
+ /* Now then. Do we have an implicit jump to r15 to deal with? */
+ if (r15written) {
+      /* If we get a jump to deal with, we assume that there's been no
+         other competing branch stuff previously generated for this
+         insn.  That's reasonable, in the sense that the ARM insn set
+         appears to declare as "Unpredictable" any instruction which
+         generates more than one possible new value for r15.  Hence
+         just assert.  The decoders themselves should check against
+         all such instructions which are thus Unpredictable, and
+ decline to decode them. Hence we should never get here if we
+ have competing new values for r15, and hence it is safe to
+ assert here. */
+ vassert(dres.whatNext == Dis_Continue);
+ vassert(irsb->next == NULL);
+      vassert(irsb->jumpkind == Ijk_Boring);
+ /* If r15 is unconditionally written, terminate the block by
+ jumping to it. If it's conditionally written, still
+ terminate the block (a shame, but we can't do side exits to
+ arbitrary destinations), but first jump to the next
+ instruction if the condition doesn't hold. */
+ /* We can't use getIReg(15) to get the destination, since that
+ will produce r15+8, which isn't what we want. Must use
+ llGetIReg(15) instead. */
+ if (r15guard == IRTemp_INVALID) {
+ /* unconditional */
+ } else {
+ /* conditional */
+ stmt( IRStmt_Exit(
+ unop(Iop_32to1,
+ binop(Iop_Xor32,
+ mkexpr(r15guard), mkU32(1))),
+ r15kind,
+ IRConst_U32(guest_R15_curr_instr_notENC + 4)
+ ));
+ }
+ irsb->next = llGetIReg(15);
+ irsb->jumpkind = r15kind;
+ dres.whatNext = Dis_StopHere;
+ }
+
+ return dres;
+
+# undef INSN_COND
+# undef INSN
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Disassemble a single Thumb2 instruction ---*/
+/*------------------------------------------------------------*/
+
+/* NB: in Thumb mode we do fetches of regs with getIRegT, which
+ automagically adds 4 to fetches of r15. However, writes to regs
+ are done with putIRegT, which disallows writes to r15. Hence any
+ r15 writes and associated jumps have to be done "by hand". */
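+
+/* For example (purely illustrative): a 16-bit insn at 0x8000 that
+   reads r15 via getIRegT sees 0x8004, the architected PC-read value
+   for Thumb.  A write of a new PC must instead be emitted by hand,
+   in the style the decoders below use:
+
+      irsb->next     = <destination expr, with bit 0 set for Thumb>;
+      irsb->jumpkind = Ijk_Boring;
+      dres.whatNext  = Dis_StopHere;
+*/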
+
+/* Disassemble a single Thumb instruction into IR. The instruction is
+ located in host memory at guest_instr, and has (decoded) guest IP
+ of guest_R15_curr_instr_notENC, which will have been set before the
+ call here. */
+
+static
+DisResult disInstr_THUMB_WRK (
+ Bool put_IP,
+ Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_instr,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo
+ )
+{
+ /* A macro to fish bits out of insn0. There's also INSN1, to fish
+ bits out of insn1, but that's defined only after the end of the
+ 16-bit insn decoder, so as to stop it mistakenly being used
+ therein. */
+# define INSN0(_bMax,_bMin) SLICE_UInt(((UInt)insn0), (_bMax), (_bMin))
+
+ DisResult dres;
+ UShort insn0; /* first 16 bits of the insn */
+ //Bool allow_VFP = False;
+ //UInt hwcaps = archinfo->hwcaps;
+ HChar dis_buf[128]; // big enough to hold LDMIA etc text
+
+ /* Summary result of the ITxxx backwards analysis: False == safe
+ but suboptimal. */
+ Bool guaranteedUnconditional = False;
+
+ /* What insn variants are we supporting today? */
+ //allow_VFP = (0 != (hwcaps & VEX_HWCAPS_ARM_VFP));
+ // etc etc
+
+ /* Set result defaults. */
+ dres.whatNext = Dis_Continue;
+ dres.len = 2;
+ dres.continueAt = 0;
+
+ /* Set default actions for post-insn handling of writes to r15, if
+ required. */
+ r15written = False;
+ r15guard = IRTemp_INVALID; /* unconditional */
+ r15kind = Ijk_Boring;
+
+ /* Insns could be 2 or 4 bytes long. Just get the first 16 bits at
+ this point. If we need the second 16, get them later. We can't
+ get them both out immediately because it risks a fault (very
+      unlikely, but possible) if the second 16 bits aren't actually
+ necessary. */
+ insn0 = getUShortLittleEndianly( guest_instr );
+
+ if (0) vex_printf("insn: 0x%x\n", insn0);
+
+ DIP("\t(thumb) 0x%x: ", (UInt)guest_R15_curr_instr_notENC);
+
+ /* We may be asked to update the guest R15 before going further. */
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ if (put_IP) {
+ llPutIReg( 15, mkU32(guest_R15_curr_instr_notENC | 1) );
+ }
+
+ /* ----------------------------------------------------------- */
+ /* Spot "Special" instructions (see comment at top of file). */
+ {
+ UChar* code = (UChar*)guest_instr;
+ /* Spot the 16-byte preamble:
+
+ ea4f 0cfc mov.w ip, ip, ror #3
+ ea4f 3c7c mov.w ip, ip, ror #13
+ ea4f 7c7c mov.w ip, ip, ror #29
+ ea4f 4cfc mov.w ip, ip, ror #19
+ */
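+      /* Each constant below is the 32-bit little-endian read of one
+         halfword pair above (each 16-bit half is itself stored
+         little-endian, hence the swapped appearance). */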
+ UInt word1 = 0x0CFCEA4F;
+ UInt word2 = 0x3C7CEA4F;
+ UInt word3 = 0x7C7CEA4F;
+ UInt word4 = 0x4CFCEA4F;
+ if (getUIntLittleEndianly(code+ 0) == word1 &&
+ getUIntLittleEndianly(code+ 4) == word2 &&
+ getUIntLittleEndianly(code+ 8) == word3 &&
+ getUIntLittleEndianly(code+12) == word4) {
+ /* Got a "Special" instruction preamble. Which one is it? */
+ // 0x 0A 0A EA 4A
+ if (getUIntLittleEndianly(code+16) == 0x0A0AEA4A
+ /* orr.w r10,r10,r10 */) {
+ /* R3 = client_request ( R4 ) */
+ DIP("r3 = client_request ( %%r4 )\n");
+ irsb->next = mkU32( (guest_R15_curr_instr_notENC + 20) | 1 );
+ irsb->jumpkind = Ijk_ClientReq;
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ else
+ // 0x 0B 0B EA 4B
+ if (getUIntLittleEndianly(code+16) == 0x0B0BEA4B
+ /* orr r11,r11,r11 */) {
+ /* R3 = guest_NRADDR */
+ DIP("r3 = guest_NRADDR\n");
+ dres.len = 20;
+ llPutIReg(3, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
+ goto decode_success;
+ }
+ else
+ // 0x 0C 0C EA 4C
+ if (getUIntLittleEndianly(code+16) == 0x0C0CEA4C
+ /* orr r12,r12,r12 */) {
+ /* branch-and-link-to-noredir R4 */
+ DIP("branch-and-link-to-noredir r4\n");
+ llPutIReg(14, mkU32( (guest_R15_curr_instr_notENC + 20) | 1 ));
+ irsb->next = getIRegT(4);
+ irsb->jumpkind = Ijk_NoRedir;
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ /* We don't know what it is. Set insn0 so decode_failure
+ can print the insn following the Special-insn preamble. */
+ insn0 = getUShortLittleEndianly(code+16);
+ goto decode_failure;
+ /*NOTREACHED*/
+ }
+
+ }
+
+ /* ----------------------------------------------------------- */
+
+ /* Main Thumb instruction decoder starts here. It's a series of
+ switches which examine ever longer bit sequences at the MSB of
+ the instruction word, first for 16-bit insns, then for 32-bit
+ insns. */
+
+ /* --- BEGIN ITxxx optimisation analysis --- */
+ /* This is a crucial optimisation for the ITState boilerplate that
+ follows. Examine the 9 halfwords preceding this instruction,
+ and if we are absolutely sure that none of them constitute an
+ 'it' instruction, then we can be sure that this instruction is
+ not under the control of any 'it' instruction, and so
+ guest_ITSTATE must be zero. So write zero into ITSTATE right
+ now, so that iropt can fold out almost all of the resulting
+ junk.
+
+ If we aren't sure, we can always safely skip this step. So be a
+ bit conservative about it: only poke around in the same page as
+ this instruction, lest we get a fault from the previous page
+ that would not otherwise have happened. The saving grace is
+ that such skipping is pretty rare -- it only happens,
+ statistically, 18/4096ths of the time, so is judged unlikely to
+      be a performance problem.
+
+ FIXME: do better. Take into account the number of insns covered
+ by any IT insns we find, to rule out cases where an IT clearly
+ cannot cover this instruction. This would improve behaviour for
+ branch targets immediately following an IT-guarded group that is
+ not of full length. Eg, (and completely ignoring issues of 16-
+ vs 32-bit insn length):
+
+ ite cond
+ insn1
+ insn2
+ label: insn3
+ insn4
+
+ The 'it' only conditionalises insn1 and insn2. However, the
+ current analysis is conservative and considers insn3 and insn4
+ also possibly guarded. Hence if 'label:' is the start of a hot
+ loop we will get a big performance hit.
+ */
+ {
+ /* Summary result of this analysis: False == safe but
+ suboptimal. */
+ vassert(guaranteedUnconditional == False);
+
+ UInt pc = guest_R15_curr_instr_notENC;
+ vassert(0 == (pc & 1));
+
+ UInt pageoff = pc & 0xFFF;
+ if (pageoff >= 18) {
+ /* It's safe to poke about in the 9 halfwords preceding this
+ insn. So, have a look at them. */
+ guaranteedUnconditional = True; /* assume no 'it' insn found, till we do */
+
+ UShort* hwp = (UShort*)(HWord)pc;
+ Int i;
+ for (i = -1; i >= -9; i--) {
+ /* We're in the same page. (True, but commented out due
+ to expense.) */
+ /*
+ vassert( ( ((UInt)(&hwp[i])) & 0xFFFFF000 )
+ == ( pc & 0xFFFFF000 ) );
+ */
+ /* All valid IT instructions must have the form 0xBFxy,
+ where x can be anything, but y must be nonzero. */
+ if ((hwp[i] & 0xFF00) == 0xBF00 && (hwp[i] & 0xF) != 0) {
+ /* might be an 'it' insn. Play safe. */
+ guaranteedUnconditional = False;
+ break;
+ }
+ }
+ }
+ }
+ /* --- END ITxxx optimisation analysis --- */
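+   /* Example of the test above: 0xBF08 ("it eq", mask 0x8) is duly
+      treated as a possible 'it' insn, whereas 0xBF00 (a plain NOP
+      hint) has a zero mask and is correctly ignored. */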
+
+ /* Generate the guarding condition for this insn, by examining
+ ITSTATE. Assign it to condT. Also, generate new
+ values for ITSTATE ready for stuffing back into the
+ guest state, but don't actually do the Put yet, since it will
+      need to be stuffed back in only after the instruction gets to a
+ point where it is sure to complete. Mostly we let the code at
+ decode_success handle this, but in cases where the insn contains
+ a side exit, we have to update them before the exit. */
+
+ /* If the ITxxx optimisation analysis above could not prove that
+ this instruction is guaranteed unconditional, we insert a
+ lengthy IR preamble to compute the guarding condition at
+ runtime. If it can prove it (which obviously we hope is the
+ normal case) then we insert a minimal preamble, which is
+ equivalent to setting guest_ITSTATE to zero and then folding
+ that through the full preamble (which completely disappears). */
+
+ IRTemp condT = IRTemp_INVALID;
+ IRTemp old_itstate = IRTemp_INVALID;
+ IRTemp new_itstate = IRTemp_INVALID;
+ IRTemp cond_AND_notInIT_T = IRTemp_INVALID;
+
+ if (guaranteedUnconditional) {
+ /* BEGIN "partial eval { ITSTATE = 0; STANDARD_PREAMBLE; }" */
+
+ // ITSTATE = 0 :: I32
+ IRTemp z32 = newTemp(Ity_I32);
+ assign(z32, mkU32(0));
+ put_ITSTATE(z32);
+
+ // old_itstate = 0 :: I32
+ //
+ // old_itstate = get_ITSTATE();
+ old_itstate = z32; /* 0 :: I32 */
+
+ // new_itstate = old_itstate >> 8
+ // = 0 >> 8
+ // = 0 :: I32
+ //
+ // new_itstate = newTemp(Ity_I32);
+ // assign(new_itstate,
+ // binop(Iop_Shr32, mkexpr(old_itstate), mkU8(8)));
+ new_itstate = z32;
+
+ // ITSTATE = 0 :: I32(again)
+ //
+ // put_ITSTATE(new_itstate);
+
+ // condT1 = calc_cond_dyn( xor(and(old_istate,0xF0), 0xE0) )
+ // = calc_cond_dyn( xor(0,0xE0) )
+ // = calc_cond_dyn ( 0xE0 )
+ // = 1 :: I32
+ // Not that this matters, since the computed value is not used:
+ // see condT folding below
+ //
+ // IRTemp condT1 = newTemp(Ity_I32);
+ // assign(condT1,
+ // mk_armg_calculate_condition_dyn(
+ // binop(Iop_Xor32,
+ // binop(Iop_And32, mkexpr(old_itstate), mkU32(0xF0)),
+ // mkU32(0xE0))
+ // )
+ // );
+
+ // condT = 32to8(and32(old_itstate,0xF0)) == 0 ? 1 : condT1
+ // = 32to8(and32(0,0xF0)) == 0 ? 1 : condT1
+ // = 32to8(0) == 0 ? 1 : condT1
+ // = 0 == 0 ? 1 : condT1
+ // = 1
+ //
+ // condT = newTemp(Ity_I32);
+ // assign(condT, IRExpr_Mux0X(
+ // unop(Iop_32to8, binop(Iop_And32,
+ // mkexpr(old_itstate),
+ // mkU32(0xF0))),
+ // mkU32(1),
+ // mkexpr(condT1)
+ // ));
+ condT = newTemp(Ity_I32);
+ assign(condT, mkU32(1));
+
+ // notInITt = xor32(and32(old_itstate, 1), 1)
+ // = xor32(and32(0, 1), 1)
+ // = xor32(0, 1)
+ // = 1 :: I32
+ //
+ // IRTemp notInITt = newTemp(Ity_I32);
+ // assign(notInITt,
+ // binop(Iop_Xor32,
+ // binop(Iop_And32, mkexpr(old_itstate), mkU32(1)),
+ // mkU32(1)));
+
+ // cond_AND_notInIT_T = and32(notInITt, condT)
+ // = and32(1, 1)
+ // = 1
+ //
+ // cond_AND_notInIT_T = newTemp(Ity_I32);
+ // assign(cond_AND_notInIT_T,
+ // binop(Iop_And32, mkexpr(notInITt), mkexpr(condT)));
+ cond_AND_notInIT_T = condT; /* 1 :: I32 */
+
+ /* END "partial eval { ITSTATE = 0; STANDARD_PREAMBLE; }" */
+ } else {
+ /* BEGIN { STANDARD PREAMBLE; } */
+
+ old_itstate = get_ITSTATE();
+
+ new_itstate = newTemp(Ity_I32);
+ assign(new_itstate,
+ binop(Iop_Shr32, mkexpr(old_itstate), mkU8(8)));
+
+ put_ITSTATE(new_itstate);
+
+ /* Same strategy as for ARM insns: generate a condition
+ temporary at this point (or IRTemp_INVALID, meaning
+ unconditional). We leave it to lower-level instruction
+ decoders to decide whether they can generate straight-line
+ code, or whether they must generate a side exit before the
+ instruction. condT :: Ity_I32 and is always either zero or
+ one. */
+ IRTemp condT1 = newTemp(Ity_I32);
+ assign(condT1,
+ mk_armg_calculate_condition_dyn(
+ binop(Iop_Xor32,
+ binop(Iop_And32, mkexpr(old_itstate), mkU32(0xF0)),
+ mkU32(0xE0))
+ )
+ );
+
+ /* This is a bit complex, but needed to make Memcheck understand
+ that, if the condition in old_itstate[7:4] denotes AL (that
+ is, if this instruction is to be executed unconditionally),
+ then condT does not depend on the results of calling the
+ helper.
+
+ We test explicitly for old_itstate[7:4] == AL ^ 0xE, and in
+ that case set condT directly to 1. Else we use the results
+ of the helper. Since old_itstate is always defined and
+ because Memcheck does lazy V-bit propagation through Mux0X,
+ this will cause condT to always be a defined 1 if the
+ condition is 'AL'. From an execution semantics point of view
+ this is irrelevant since we're merely duplicating part of the
+ behaviour of the helper. But it makes it clear to Memcheck,
+ in this case, that condT does not in fact depend on the
+ contents of the condition code thunk. Without it, we get
+ quite a lot of false errors.
+
+ So, just to clarify: from a straight semantics point of view,
+ we can simply do "assign(condT, mkexpr(condT1))", and the
+ simulator still runs fine. It's just that we get loads of
+ false errors from Memcheck. */
+ condT = newTemp(Ity_I32);
+ assign(condT, IRExpr_Mux0X(
+ unop(Iop_32to8, binop(Iop_And32,
+ mkexpr(old_itstate),
+ mkU32(0xF0))),
+ mkU32(1),
+ mkexpr(condT1)
+ ));
+
+ /* Something we don't have in ARM: generate a 0 or 1 value
+ indicating whether or not we are in an IT block (NB: 0 = in
+ IT block, 1 = not in IT block). This is used to gate
+ condition code updates in 16-bit Thumb instructions. */
+ IRTemp notInITt = newTemp(Ity_I32);
+ assign(notInITt,
+ binop(Iop_Xor32,
+ binop(Iop_And32, mkexpr(old_itstate), mkU32(1)),
+ mkU32(1)));
+
+ /* Compute 'condT && notInITt' -- that is, the instruction is
+ going to execute, and we're not in an IT block. This is the
+ gating condition for updating condition codes in 16-bit Thumb
+ instructions, except for CMP, CMN and TST. */
+ cond_AND_notInIT_T = newTemp(Ity_I32);
+ assign(cond_AND_notInIT_T,
+ binop(Iop_And32, mkexpr(notInITt), mkexpr(condT)));
+ /* END { STANDARD PREAMBLE; } */
+ }
+
+
+ /* At this point:
+ * ITSTATE has been updated
+ * condT holds the guarding condition for this instruction (0 or 1),
+ * notInITt is 1 if we're in "normal" code, 0 if in an IT block
+ * cond_AND_notInIT_T is the AND of the above two.
+
+ If the instruction proper can't trap, then there's nothing else
+      to do w.r.t. ITSTATE -- just go and generate IR for the
+ insn, taking into account the guarding condition.
+
+ If, however, the instruction might trap, then we must back up
+ ITSTATE to the old value, and re-update it after the potentially
+ trapping IR section. A trap can happen either via a memory
+ reference or because we need to throw SIGILL.
+
+ If an instruction has a side exit, we need to be sure that any
+ ITSTATE backup is re-updated before the side exit.
+ */
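+
+   /* A minimal sketch of that back-out pattern, as used by the
+      memory-referencing cases below:
+
+         mk_skip_over_T16_if_cond_is_false(condT);
+         condT = IRTemp_INVALID;    // now unconditional
+         put_ITSTATE(old_itstate);  // back out the ITSTATE update
+         ... the potentially-trapping load or store ...
+         put_ITSTATE(new_itstate);  // reinstate the update
+   */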
+
+ /* ----------------------------------------------------------- */
+ /* -- -- */
+ /* -- Thumb 16-bit integer instructions -- */
+ /* -- -- */
+ /* -- IMPORTANT: references to insn1 or INSN1 are -- */
+ /* -- not allowed in this section -- */
+ /* -- -- */
+ /* ----------------------------------------------------------- */
+
+ /* 16-bit instructions inside an IT block, apart from CMP, CMN and
+ TST, do not set the condition codes. Hence we must dynamically
+ test for this case for every condition code update. */
+
+ IROp anOp = Iop_INVALID;
+ HChar* anOpNm = NULL;
+
+ /* ================ 16-bit 15:6 cases ================ */
+
+ switch (INSN0(15,6)) {
+
+ case 0x10a: // CMP
+ case 0x10b: { // CMN
+         /* ---------------- CMP Rn, Rm ---------------- */
+         /* ---------------- CMN Rn, Rm ---------------- */
+ Bool isCMN = INSN0(15,6) == 0x10b;
+ UInt rN = INSN0(2,0);
+ UInt rM = INSN0(5,3);
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ assign( argL, getIRegT(rN) );
+ assign( argR, getIRegT(rM) );
+ /* Update flags regardless of whether in an IT block or not. */
+ setFlags_D1_D2( isCMN ? ARMG_CC_OP_ADD : ARMG_CC_OP_SUB,
+ argL, argR, condT );
+ DIP("%s r%u, r%u\n", isCMN ? "cmn" : "cmp", rN, rM);
+ goto decode_success;
+ }
+
+ case 0x108: {
+ /* ---------------- TST Rn, Rm ---------------- */
+ UInt rN = INSN0(2,0);
+ UInt rM = INSN0(5,3);
+ IRTemp oldC = newTemp(Ity_I32);
+ IRTemp oldV = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign( oldC, mk_armg_calculate_flag_c() );
+ assign( oldV, mk_armg_calculate_flag_v() );
+ assign( res, binop(Iop_And32, getIRegT(rN), getIRegT(rM)) );
+ /* Update flags regardless of whether in an IT block or not. */
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV, condT );
+ DIP("tst r%u, r%u\n", rN, rM);
+ goto decode_success;
+ }
+
+ case 0x109: {
+ /* ---------------- NEGS Rd, Rm ---------------- */
+ /* Rd = -Rm */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ IRTemp arg = newTemp(Ity_I32);
+ IRTemp zero = newTemp(Ity_I32);
+ assign(arg, getIRegT(rM));
+ assign(zero, mkU32(0));
+ // rD can never be r15
+ putIRegT(rD, binop(Iop_Sub32, mkexpr(zero), mkexpr(arg)), condT);
+ setFlags_D1_D2( ARMG_CC_OP_SUB, zero, arg, cond_AND_notInIT_T);
+ DIP("negs r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+
+ case 0x10F: {
+ /* ---------------- MVNS Rd, Rm ---------------- */
+ /* Rd = ~Rm */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ IRTemp oldV = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ assign( oldC, mk_armg_calculate_flag_c() );
+ assign(res, unop(Iop_Not32, getIRegT(rM)));
+ // rD can never be r15
+ putIRegT(rD, mkexpr(res), condT);
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV,
+ cond_AND_notInIT_T );
+ DIP("mvns r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+
+ case 0x10C:
+ /* ---------------- ORRS Rd, Rm ---------------- */
+ anOp = Iop_Or32; anOpNm = "orr"; goto and_orr_eor_mul;
+ case 0x100:
+ /* ---------------- ANDS Rd, Rm ---------------- */
+ anOp = Iop_And32; anOpNm = "and"; goto and_orr_eor_mul;
+ case 0x101:
+ /* ---------------- EORS Rd, Rm ---------------- */
+ anOp = Iop_Xor32; anOpNm = "eor"; goto and_orr_eor_mul;
+ case 0x10d:
+ /* ---------------- MULS Rd, Rm ---------------- */
+ anOp = Iop_Mul32; anOpNm = "mul"; goto and_orr_eor_mul;
+ and_orr_eor_mul: {
+ /* Rd = Rd `op` Rm */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp oldV = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ assign( oldC, mk_armg_calculate_flag_c() );
+ assign( res, binop(anOp, getIRegT(rD), getIRegT(rM) ));
+ // not safe to read guest state after here
+ // rD can never be r15
+ putIRegT(rD, mkexpr(res), condT);
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV,
+ cond_AND_notInIT_T );
+ DIP("%s r%u, r%u\n", anOpNm, rD, rM);
+ goto decode_success;
+ }
+
+ case 0x10E: {
+ /* ---------------- BICS Rd, Rm ---------------- */
+ /* Rd = Rd & ~Rm */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp oldV = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ assign( oldC, mk_armg_calculate_flag_c() );
+ assign( res, binop(Iop_And32, getIRegT(rD),
+ unop(Iop_Not32, getIRegT(rM) )));
+ // not safe to read guest state after here
+ // rD can never be r15
+ putIRegT(rD, mkexpr(res), condT);
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV,
+ cond_AND_notInIT_T );
+ DIP("bics r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+
+ case 0x105: {
+ /* ---------------- ADCS Rd, Rm ---------------- */
+ /* Rd = Rd + Rm + oldC */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign(argL, getIRegT(rD));
+ assign(argR, getIRegT(rM));
+ assign(oldC, mk_armg_calculate_flag_c());
+ assign(res, binop(Iop_Add32,
+ binop(Iop_Add32, mkexpr(argL), mkexpr(argR)),
+ mkexpr(oldC)));
+ // rD can never be r15
+ putIRegT(rD, mkexpr(res), condT);
+ setFlags_D1_D2_ND( ARMG_CC_OP_ADC, argL, argR, oldC,
+ cond_AND_notInIT_T );
+ DIP("adcs r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+
+ case 0x106: {
+ /* ---------------- SBCS Rd, Rm ---------------- */
+ /* Rd = Rd - Rm - (oldC ^ 1) */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign(argL, getIRegT(rD));
+ assign(argR, getIRegT(rM));
+ assign(oldC, mk_armg_calculate_flag_c());
+ assign(res, binop(Iop_Sub32,
+ binop(Iop_Sub32, mkexpr(argL), mkexpr(argR)),
+ binop(Iop_Xor32, mkexpr(oldC), mkU32(1))));
+ // rD can never be r15
+ putIRegT(rD, mkexpr(res), condT);
+ setFlags_D1_D2_ND( ARMG_CC_OP_SBB, argL, argR, oldC,
+ cond_AND_notInIT_T );
+ DIP("sbcs r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+
+ case 0x2CB: {
+ /* ---------------- UXTB Rd, Rm ---------------- */
+ /* Rd = 8Uto32(Rm) */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ putIRegT(rD, binop(Iop_And32, getIRegT(rM), mkU32(0xFF)),
+ condT);
+ DIP("uxtb r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+
+ case 0x2C9: {
+ /* ---------------- SXTB Rd, Rm ---------------- */
+ /* Rd = 8Sto32(Rm) */
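+         /* Implemented as (Rm << 24) >>s 24: the arithmetic right
+            shift replicates bit 7 of Rm into bits 31:8. */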
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ putIRegT(rD, binop(Iop_Sar32,
+ binop(Iop_Shl32, getIRegT(rM), mkU8(24)),
+ mkU8(24)),
+ condT);
+ DIP("sxtb r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+
+ case 0x2CA: {
+ /* ---------------- UXTH Rd, Rm ---------------- */
+ /* Rd = 16Uto32(Rm) */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ putIRegT(rD, binop(Iop_And32, getIRegT(rM), mkU32(0xFFFF)),
+ condT);
+ DIP("uxth r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+
+ case 0x2C8: {
+ /* ---------------- SXTH Rd, Rm ---------------- */
+ /* Rd = 16Sto32(Rm) */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ putIRegT(rD, binop(Iop_Sar32,
+ binop(Iop_Shl32, getIRegT(rM), mkU8(16)),
+ mkU8(16)),
+ condT);
+ DIP("sxth r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+
+ case 0x102: // LSLS
+ case 0x103: // LSRS
+ case 0x104: // ASRS
+ case 0x107: { // RORS
+         /* ---------------- LSLS Rd, Rs ---------------- */
+         /* ---------------- LSRS Rd, Rs ---------------- */
+         /* ---------------- ASRS Rd, Rs ---------------- */
+         /* ---------------- RORS Rd, Rs ---------------- */
+ /* Rd = Rd `op` Rs, and set flags */
+ UInt rS = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ IRTemp oldV = newTemp(Ity_I32);
+ IRTemp rDt = newTemp(Ity_I32);
+ IRTemp rSt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp resC = newTemp(Ity_I32);
+ HChar* wot = "???";
+ assign(rSt, getIRegT(rS));
+ assign(rDt, getIRegT(rD));
+ assign(oldV, mk_armg_calculate_flag_v());
+ /* Does not appear to be the standard 'how' encoding. */
+ switch (INSN0(15,6)) {
+ case 0x102:
+ compute_result_and_C_after_LSL_by_reg(
+ dis_buf, &res, &resC, rDt, rSt, rD, rS
+ );
+ wot = "lsl";
+ break;
+ case 0x103:
+ compute_result_and_C_after_LSR_by_reg(
+ dis_buf, &res, &resC, rDt, rSt, rD, rS
+ );
+ wot = "lsr";
+ break;
+ case 0x104:
+ compute_result_and_C_after_ASR_by_reg(
+ dis_buf, &res, &resC, rDt, rSt, rD, rS
+ );
+ wot = "asr";
+ break;
+ case 0x107:
+ compute_result_and_C_after_ROR_by_reg(
+ dis_buf, &res, &resC, rDt, rSt, rD, rS
+ );
+ wot = "ror";
+ break;
+ default:
+ /*NOTREACHED*/vassert(0);
+ }
+ // not safe to read guest state after this point
+ putIRegT(rD, mkexpr(res), condT);
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, resC, oldV,
+ cond_AND_notInIT_T );
+         DIP("%ss r%u, r%u\n", wot, rD, rS);
+ goto decode_success;
+ }
+
+ case 0x2E8: // REV
+ case 0x2E9: { // REV16
+ /* ---------------- REV Rd, Rm ---------------- */
+ /* ---------------- REV16 Rd, Rm ---------------- */
+ UInt rM = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ Bool isREV = INSN0(15,6) == 0x2E8;
+ IRTemp arg = newTemp(Ity_I32);
+ assign(arg, getIRegT(rM));
+ IRTemp res = isREV ? gen_REV(arg) : gen_REV16(arg);
+ putIRegT(rD, mkexpr(res), condT);
+ DIP("rev%s r%u, r%u\n", isREV ? "" : "16", rD, rM);
+ goto decode_success;
+ }
+
+ default:
+ break; /* examine the next shortest prefix */
+
+ }
+
+
+ /* ================ 16-bit 15:7 cases ================ */
+
+ switch (INSN0(15,7)) {
+
+ case BITS9(1,0,1,1,0,0,0,0,0): {
+ /* ------------ ADD SP, #imm7 * 4 ------------ */
+ UInt uimm7 = INSN0(6,0);
+ putIRegT(13, binop(Iop_Add32, getIRegT(13), mkU32(uimm7 * 4)),
+ condT);
+ DIP("add sp, #%u\n", uimm7 * 4);
+ goto decode_success;
+ }
+
+ case BITS9(1,0,1,1,0,0,0,0,1): {
+ /* ------------ SUB SP, #imm7 * 4 ------------ */
+ UInt uimm7 = INSN0(6,0);
+ putIRegT(13, binop(Iop_Sub32, getIRegT(13), mkU32(uimm7 * 4)),
+ condT);
+ DIP("sub sp, #%u\n", uimm7 * 4);
+ goto decode_success;
+ }
+
+ case BITS9(0,1,0,0,0,1,1,1,0): {
+ /* ---------------- BX rM ---------------- */
+         /* Branch to reg, and optionally switch modes. The reg holds
+            a suitably encoded address (with CPSR.T in the bottom bit).
+ Have to special-case r15, as usual. */
+ UInt rM = (INSN0(6,6) << 3) | INSN0(5,3);
+ if (BITS3(0,0,0) == INSN0(2,0)) {
+ IRTemp dst = newTemp(Ity_I32);
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+ if (rM <= 14) {
+ assign( dst, getIRegT(rM) );
+ } else {
+ vassert(rM == 15);
+ assign( dst, mkU32(guest_R15_curr_instr_notENC + 4) );
+ }
+ irsb->next = mkexpr(dst);
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ DIP("bx r%u (possibly switch to ARM mode)\n", rM);
+ goto decode_success;
+ }
+ break;
+ }
+
+ /* ---------------- BLX rM ---------------- */
+ /* Branch and link to interworking address in rM. */
+ case BITS9(0,1,0,0,0,1,1,1,1): {
+ if (BITS3(0,0,0) == INSN0(2,0)) {
+ UInt rM = (INSN0(6,6) << 3) | INSN0(5,3);
+ IRTemp dst = newTemp(Ity_I32);
+ if (rM <= 14) {
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+ /* We're returning to Thumb code, hence "| 1" */
+ assign( dst, getIRegT(rM) );
+ putIRegT( 14, mkU32( (guest_R15_curr_instr_notENC + 2) | 1 ),
+ IRTemp_INVALID );
+ irsb->next = mkexpr(dst);
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ DIP("blx r%u (possibly switch to ARM mode)\n", rM);
+ goto decode_success;
+ }
+ /* else unpredictable, fall through */
+ }
+ break;
+ }
+
+ default:
+ break; /* examine the next shortest prefix */
+
+ }
+
+
+ /* ================ 16-bit 15:8 cases ================ */
+
+ switch (INSN0(15,8)) {
+
+ case BITS8(1,1,0,1,1,1,1,1): {
+ /* ---------------- SVC ---------------- */
+ UInt imm8 = INSN0(7,0);
+ if (imm8 == 0) {
+ /* A syscall. We can't do this conditionally, hence: */
+ mk_skip_over_T16_if_cond_is_false( condT );
+ // FIXME: what if we have to back up and restart this insn?
+ // then ITSTATE will be wrong (we'll have it as "used")
+ // when it isn't. Correct is to save ITSTATE in a
+ // stash pseudo-reg, and back up from that if we have to
+ // restart.
+ // uncond after here
+ irsb->next = mkU32( (guest_R15_curr_instr_notENC + 2) | 1 );
+ irsb->jumpkind = Ijk_Sys_syscall;
+ dres.whatNext = Dis_StopHere;
+ DIP("svc #0x%08x\n", imm8);
+ goto decode_success;
+ }
+ /* else fall through */
+ break;
+ }
+
+ case BITS8(0,1,0,0,0,1,0,0): {
+ /* ---------------- ADD(HI) Rd, Rm ---------------- */
+ UInt h1 = INSN0(7,7);
+ UInt h2 = INSN0(6,6);
+ UInt rM = (h2 << 3) | INSN0(5,3);
+ UInt rD = (h1 << 3) | INSN0(2,0);
+ //if (h1 == 0 && h2 == 0) { // Original T1 was more restrictive
+ if (rD == 15 && rM == 15) {
+ // then it's invalid
+ } else {
+ IRTemp res = newTemp(Ity_I32);
+ assign( res, binop(Iop_Add32, getIRegT(rD), getIRegT(rM) ));
+ if (rD != 15) {
+ putIRegT( rD, mkexpr(res), condT );
+ } else {
+ /* Only allowed outside or last-in IT block; SIGILL if not so. */
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ /* jump over insn if not selected */
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+ /* non-interworking branch */
+ irsb->next = binop(Iop_Or32, mkexpr(res), mkU32(1));
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("add(hi) r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+ break;
+ }
+
+ case BITS8(0,1,0,0,0,1,0,1): {
+ /* ---------------- CMP(HI) Rd, Rm ---------------- */
+ UInt h1 = INSN0(7,7);
+ UInt h2 = INSN0(6,6);
+ UInt rM = (h2 << 3) | INSN0(5,3);
+ UInt rN = (h1 << 3) | INSN0(2,0);
+ if (h1 != 0 || h2 != 0) {
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ assign( argL, getIRegT(rN) );
+ assign( argR, getIRegT(rM) );
+ /* Update flags regardless of whether in an IT block or not. */
+ setFlags_D1_D2( ARMG_CC_OP_SUB, argL, argR, condT );
+ DIP("cmphi r%u, r%u\n", rN, rM);
+ goto decode_success;
+ }
+ break;
+ }
+
+ case BITS8(0,1,0,0,0,1,1,0): {
+ /* ---------------- MOV(HI) Rd, Rm ---------------- */
+ UInt h1 = INSN0(7,7);
+ UInt h2 = INSN0(6,6);
+ UInt rM = (h2 << 3) | INSN0(5,3);
+ UInt rD = (h1 << 3) | INSN0(2,0);
+ /* The old ARM ARM seems to disallow the case where both Rd and
+ Rm are "low" registers, but newer versions allow it. */
+ if (1 /*h1 != 0 || h2 != 0*/) {
+ IRTemp val = newTemp(Ity_I32);
+ assign( val, getIRegT(rM) );
+ if (rD != 15) {
+ putIRegT( rD, mkexpr(val), condT );
+ } else {
+ /* Only allowed outside or last-in IT block; SIGILL if not so. */
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ /* jump over insn if not selected */
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+ /* non-interworking branch */
+ irsb->next = binop(Iop_Or32, mkexpr(val), mkU32(1));
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("mov r%u, r%u\n", rD, rM);
+ goto decode_success;
+ }
+ break;
+ }
+
+ case BITS8(1,0,1,1,1,1,1,1): {
+ /* ---------------- IT (if-then) ---------------- */
+ UInt firstcond = INSN0(7,4);
+ UInt mask = INSN0(3,0);
+ UInt newITSTATE = 0;
+ /* This is the ITSTATE represented as described in
+ libvex_guest_arm.h. It is not the ARM ARM representation. */
+ UChar c1 = '.';
+ UChar c2 = '.';
+ UChar c3 = '.';
+ Bool valid = compute_ITSTATE( &newITSTATE, &c1, &c2, &c3,
+ firstcond, mask );
+ if (valid && firstcond != 0xF/*NV*/) {
+ /* Not allowed in an IT block; SIGILL if so. */
+ gen_SIGILL_T_if_in_ITBlock(old_itstate, new_itstate);
+
+ IRTemp t = newTemp(Ity_I32);
+ assign(t, mkU32(newITSTATE));
+ put_ITSTATE(t);
+
+ DIP("it%c%c%c %s\n", c1, c2, c3, nCC(firstcond));
+ goto decode_success;
+ }
+ break;
+ }
+
+ case BITS8(1,0,1,1,0,0,0,1):
+ case BITS8(1,0,1,1,0,0,1,1):
+ case BITS8(1,0,1,1,1,0,0,1):
+ case BITS8(1,0,1,1,1,0,1,1): {
+ /* ---------------- CB{N}Z ---------------- */
+ UInt rN = INSN0(2,0);
+ UInt bOP = INSN0(11,11);
+ UInt imm32 = (INSN0(9,9) << 6) | (INSN0(7,3) << 1);
+ gen_SIGILL_T_if_in_ITBlock(old_itstate, new_itstate);
+ /* It's a conditional branch forward. */
+ IRTemp kond = newTemp(Ity_I1);
+ assign( kond, binop(bOP ? Iop_CmpNE32 : Iop_CmpEQ32,
+ getIRegT(rN), mkU32(0)) );
+
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ /* Looks like the nearest insn we can branch to is the one after
+ next. That makes sense, as there's no point in being able to
+ encode a conditional branch to the next instruction. */
+ UInt dst = (guest_R15_curr_instr_notENC + 4 + imm32) | 1;
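+         /* For example (illustrative): encoding 0xB108 has bOP = 0,
+            i = 0, imm5 = 1, Rn = r0, so imm32 = 2 and the insn is
+            "cbz r0, .+6" relative to the insn's own address. */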
+ stmt(IRStmt_Exit( mkexpr(kond),
+ Ijk_Boring,
+ IRConst_U32(toUInt(dst)) ));
+ DIP("cb%s r%u, 0x%x\n", bOP ? "nz" : "z", rN, dst - 1);
+ goto decode_success;
+ }
+
+ default:
+ break; /* examine the next shortest prefix */
+
+ }
+
+
+ /* ================ 16-bit 15:9 cases ================ */
+
+ switch (INSN0(15,9)) {
+
+ case BITS7(1,0,1,1,0,1,0): {
+ /* ---------------- PUSH ---------------- */
+ /* This is a bit like STMxx, but way simpler. Complications we
+ don't have to deal with:
+ * SP being one of the transferred registers
+ * direction (increment vs decrement)
+ * before-vs-after-ness
+ */
+ Int i, nRegs;
+ UInt bitR = INSN0(8,8);
+ UInt regList = INSN0(7,0);
+ if (bitR) regList |= (1 << 14);
+
+ if (regList != 0) {
+ /* Since we can't generate a guaranteed non-trapping IR
+ sequence, (1) jump over the insn if it is gated false, and
+ (2) back out the ITSTATE update. */
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ put_ITSTATE(old_itstate);
+ // now uncond
+
+ nRegs = 0;
+ for (i = 0; i < 16; i++) {
+ if ((regList & (1 << i)) != 0)
+ nRegs++;
+ }
+            vassert(nRegs >= 1 && nRegs <= 9); /* r0..r7, and lr if bitR */
+
+            /* Move SP down first of all, so we're "covered" -- that
+               is, the area about to be written then lies within the
+               live stack.  And don't mess with SP's alignment. */
+ IRTemp newSP = newTemp(Ity_I32);
+ assign(newSP, binop(Iop_Sub32, getIRegT(13), mkU32(4 * nRegs)));
+ putIRegT(13, mkexpr(newSP), IRTemp_INVALID);
+
+ /* Generate a transfer base address as a forced-aligned
+ version of the final SP value. */
+ IRTemp base = newTemp(Ity_I32);
+ assign(base, binop(Iop_And32, mkexpr(newSP), mkU32(~3)));
+
+ /* Now the transfers */
+ nRegs = 0;
+ for (i = 0; i < 16; i++) {
+ if ((regList & (1 << i)) != 0) {
+ storeLE( binop(Iop_Add32, mkexpr(base), mkU32(4 * nRegs)),
+ getIRegT(i) );
+ nRegs++;
+ }
+ }
+
+ /* Reinstate the ITSTATE update. */
+ put_ITSTATE(new_itstate);
+
+ DIP("push {%s0x%04x}\n", bitR ? "lr," : "", regList & 0xFF);
+ goto decode_success;
+ }
+ break;
+ }
+
+ case BITS7(1,0,1,1,1,1,0): {
+ /* ---------------- POP ---------------- */
+ Int i, nRegs;
+ UInt bitR = INSN0(8,8);
+ UInt regList = INSN0(7,0);
+
+ if (regList != 0 || bitR) {
+ /* Since we can't generate a guaranteed non-trapping IR
+ sequence, (1) jump over the insn if it is gated false, and
+ (2) back out the ITSTATE update. */
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ put_ITSTATE(old_itstate);
+ // now uncond
+
+ nRegs = 0;
+ for (i = 0; i < 8; i++) {
+ if ((regList & (1 << i)) != 0)
+ nRegs++;
+ }
+            vassert(nRegs >= 0 && nRegs <= 8); /* r0..r7; pc is bitR */
+ vassert(bitR == 0 || bitR == 1);
+
+ IRTemp oldSP = newTemp(Ity_I32);
+ assign(oldSP, getIRegT(13));
+
+ /* Generate a transfer base address as a forced-aligned
+ version of the original SP value. */
+ IRTemp base = newTemp(Ity_I32);
+ assign(base, binop(Iop_And32, mkexpr(oldSP), mkU32(~3)));
+
+ /* Compute a new value for SP, but don't install it yet, so
+ that we're "covered" until all the transfers are done.
+ And don't mess with its alignment. */
+ IRTemp newSP = newTemp(Ity_I32);
+ assign(newSP, binop(Iop_Add32, mkexpr(oldSP),
+ mkU32(4 * (nRegs + bitR))));
+
+ /* Now the transfers, not including PC */
+ nRegs = 0;
+ for (i = 0; i < 8; i++) {
+ if ((regList & (1 << i)) != 0) {
+ putIRegT(i, loadLE( Ity_I32,
+ binop(Iop_Add32, mkexpr(base),
+ mkU32(4 * nRegs))),
+ IRTemp_INVALID );
+ nRegs++;
+ }
+ }
+
+ IRTemp newPC = IRTemp_INVALID;
+ if (bitR) {
+ newPC = newTemp(Ity_I32);
+ assign( newPC, loadLE( Ity_I32,
+ binop(Iop_Add32, mkexpr(base),
+ mkU32(4 * nRegs))));
+ }
+
+ /* Now we can safely install the new SP value */
+ putIRegT(13, mkexpr(newSP), IRTemp_INVALID);
+
+ /* Reinstate the ITSTATE update. */
+ put_ITSTATE(new_itstate);
+
+ /* now, do we also have to do a branch? If so, it turns out
+ that the new PC value is encoded exactly as we need it to
+ be -- with CPSR.T in the bottom bit. So we can simply use
+ it as is, no need to mess with it. Note, therefore, this
+ is an interworking return. */
+ if (bitR) {
+ irsb->next = mkexpr(newPC);
+ irsb->jumpkind = Ijk_Ret;
+ dres.whatNext = Dis_StopHere;
+ }
+
+ DIP("pop {%s0x%04x}\n", bitR ? "pc," : "", regList & 0xFF);
+ goto decode_success;
+ }
+ break;
+ }
+
+ case BITS7(0,0,0,1,1,1,0): /* ADDS */
+ case BITS7(0,0,0,1,1,1,1): { /* SUBS */
+ /* ---------------- ADDS Rd, Rn, #uimm3 ---------------- */
+ /* ---------------- SUBS Rd, Rn, #uimm3 ---------------- */
+ UInt uimm3 = INSN0(8,6);
+ UInt rN = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ UInt isSub = INSN0(9,9);
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ assign( argL, getIRegT(rN) );
+ assign( argR, mkU32(uimm3) );
+ putIRegT(rD, binop(isSub ? Iop_Sub32 : Iop_Add32,
+ mkexpr(argL), mkexpr(argR)),
+ condT);
+ setFlags_D1_D2( isSub ? ARMG_CC_OP_SUB : ARMG_CC_OP_ADD,
+ argL, argR, cond_AND_notInIT_T );
+ DIP("%s r%u, r%u, #%u\n", isSub ? "subs" : "adds", rD, rN, uimm3);
+ goto decode_success;
+ }
+
+ case BITS7(0,0,0,1,1,0,0): /* ADDS */
+ case BITS7(0,0,0,1,1,0,1): { /* SUBS */
+ /* ---------------- ADDS Rd, Rn, Rm ---------------- */
+ /* ---------------- SUBS Rd, Rn, Rm ---------------- */
+ UInt rM = INSN0(8,6);
+ UInt rN = INSN0(5,3);
+ UInt rD = INSN0(2,0);
+ UInt isSub = INSN0(9,9);
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ assign( argL, getIRegT(rN) );
+ assign( argR, getIRegT(rM) );
+ putIRegT( rD, binop(isSub ? Iop_Sub32 : Iop_Add32,
+ mkexpr(argL), mkexpr(argR)),
+ condT );
+ setFlags_D1_D2( isSub ? ARMG_CC_OP_SUB : ARMG_CC_OP_ADD,
+ argL, argR, cond_AND_notInIT_T );
+ DIP("%s r%u, r%u, r%u\n", isSub ? "subs" : "adds", rD, rN, rM);
+ goto decode_success;
+ }
+
+ case BITS7(0,1,0,1,0,0,0): /* STR */
+ case BITS7(0,1,0,1,1,0,0): { /* LDR */
+ /* ------------- LDR Rd, [Rn, Rm] ------------- */
+ /* ------------- STR Rd, [Rn, Rm] ------------- */
+ /* LDR/STR Rd, [Rn + Rm] */
+ UInt rD = INSN0(2,0);
+ UInt rN = INSN0(5,3);
+ UInt rM = INSN0(8,6);
+ UInt isLD = INSN0(11,11);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRExpr* ea = binop(Iop_Add32, getIRegT(rN), getIRegT(rM));
+ put_ITSTATE(old_itstate); // backout
+ if (isLD) {
+ putIRegT(rD, loadLE(Ity_I32, ea), IRTemp_INVALID);
+ } else {
+ storeLE(ea, getIRegT(rD));
+ }
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("%s r%u, [r%u, r%u]\n", isLD ? "ldr" : "str", rD, rN, rM);
+ goto decode_success;
+ }
+
+ case BITS7(0,1,0,1,0,0,1):
+ case BITS7(0,1,0,1,1,0,1): {
+ /* ------------- LDRH Rd, [Rn, Rm] ------------- */
+ /* ------------- STRH Rd, [Rn, Rm] ------------- */
+ /* LDRH/STRH Rd, [Rn + Rm] */
+ UInt rD = INSN0(2,0);
+ UInt rN = INSN0(5,3);
+ UInt rM = INSN0(8,6);
+ UInt isLD = INSN0(11,11);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRExpr* ea = binop(Iop_Add32, getIRegT(rN), getIRegT(rM));
+ put_ITSTATE(old_itstate); // backout
+ if (isLD) {
+ putIRegT(rD, unop(Iop_16Uto32, loadLE(Ity_I16, ea)),
+ IRTemp_INVALID);
+ } else {
+ storeLE( ea, unop(Iop_32to16, getIRegT(rD)) );
+ }
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("%sh r%u, [r%u, r%u]\n", isLD ? "ldr" : "str", rD, rN, rM);
+ goto decode_success;
+ }
+
+ case BITS7(0,1,0,1,1,1,1): {
+ /* ------------- LDRSH Rd, [Rn, Rm] ------------- */
+ /* LDRSH Rd, [Rn + Rm] */
+ UInt rD = INSN0(2,0);
+ UInt rN = INSN0(5,3);
+ UInt rM = INSN0(8,6);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRExpr* ea = binop(Iop_Add32, getIRegT(rN), getIRegT(rM));
+ put_ITSTATE(old_itstate); // backout
+ putIRegT(rD, unop(Iop_16Sto32, loadLE(Ity_I16, ea)),
+ IRTemp_INVALID);
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("ldrsh r%u, [r%u, r%u]\n", rD, rN, rM);
+ goto decode_success;
+ }
+
+ case BITS7(0,1,0,1,0,1,1): {
+ /* ------------- LDRSB Rd, [Rn, Rm] ------------- */
+ /* LDRSB Rd, [Rn + Rm] */
+ UInt rD = INSN0(2,0);
+ UInt rN = INSN0(5,3);
+ UInt rM = INSN0(8,6);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRExpr* ea = binop(Iop_Add32, getIRegT(rN), getIRegT(rM));
+ put_ITSTATE(old_itstate); // backout
+ putIRegT(rD, unop(Iop_8Sto32, loadLE(Ity_I8, ea)),
+ IRTemp_INVALID);
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("ldrsb r%u, [r%u, r%u]\n", rD, rN, rM);
+ goto decode_success;
+ }
+
+ case BITS7(0,1,0,1,0,1,0):
+ case BITS7(0,1,0,1,1,1,0): {
+ /* ------------- LDRB Rd, [Rn, Rm] ------------- */
+ /* ------------- STRB Rd, [Rn, Rm] ------------- */
+ /* LDRB/STRB Rd, [Rn + Rm] */
+ UInt rD = INSN0(2,0);
+ UInt rN = INSN0(5,3);
+ UInt rM = INSN0(8,6);
+ UInt isLD = INSN0(11,11);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRExpr* ea = binop(Iop_Add32, getIRegT(rN), getIRegT(rM));
+ put_ITSTATE(old_itstate); // backout
+ if (isLD) {
+ putIRegT(rD, unop(Iop_8Uto32, loadLE(Ity_I8, ea)),
+ IRTemp_INVALID);
+ } else {
+ storeLE( ea, unop(Iop_32to8, getIRegT(rD)) );
+ }
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("%sb r%u, [r%u, r%u]\n", isLD ? "ldr" : "str", rD, rN, rM);
+ goto decode_success;
+ }
+
+ default:
+ break; /* examine the next shortest prefix */
+
+ }
+
+
+ /* ================ 16-bit 15:11 cases ================ */
+
+ switch (INSN0(15,11)) {
+
+ case BITS5(0,0,1,1,0):
+ case BITS5(0,0,1,1,1): {
+ /* ---------------- ADDS Rn, #uimm8 ---------------- */
+ /* ---------------- SUBS Rn, #uimm8 ---------------- */
+ UInt isSub = INSN0(11,11);
+ UInt rN = INSN0(10,8);
+ UInt uimm8 = INSN0(7,0);
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ assign( argL, getIRegT(rN) );
+ assign( argR, mkU32(uimm8) );
+ putIRegT( rN, binop(isSub ? Iop_Sub32 : Iop_Add32,
+ mkexpr(argL), mkexpr(argR)), condT );
+ setFlags_D1_D2( isSub ? ARMG_CC_OP_SUB : ARMG_CC_OP_ADD,
+ argL, argR, cond_AND_notInIT_T );
+ DIP("%s r%u, #%u\n", isSub ? "subs" : "adds", rN, uimm8);
+ goto decode_success;
+ }
+
+ case BITS5(1,0,1,0,0): {
+ /* ---------------- ADD rD, PC, #imm8 * 4 ---------------- */
+ /* a.k.a. ADR */
+ /* rD = align4(PC) + imm8 * 4 */
+ UInt rD = INSN0(10,8);
+ UInt imm8 = INSN0(7,0);
+ putIRegT(rD, binop(Iop_Add32,
+ binop(Iop_And32, getIRegT(15), mkU32(~3U)),
+ mkU32(imm8 * 4)),
+ condT);
+ DIP("add r%u, pc, #%u\n", rD, imm8 * 4);
+ goto decode_success;
+ }
+
+ case BITS5(1,0,1,0,1): {
+ /* ---------------- ADD rD, SP, #imm8 * 4 ---------------- */
+ UInt rD = INSN0(10,8);
+ UInt imm8 = INSN0(7,0);
+ putIRegT(rD, binop(Iop_Add32, getIRegT(13), mkU32(imm8 * 4)),
+ condT);
+ DIP("add r%u, r13, #%u\n", rD, imm8 * 4);
+ goto decode_success;
+ }
+
+ case BITS5(0,0,1,0,1): {
+ /* ---------------- CMP Rn, #uimm8 ---------------- */
+ UInt rN = INSN0(10,8);
+ UInt uimm8 = INSN0(7,0);
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ assign( argL, getIRegT(rN) );
+ assign( argR, mkU32(uimm8) );
+ /* Update flags regardless of whether in an IT block or not. */
+ setFlags_D1_D2( ARMG_CC_OP_SUB, argL, argR, condT );
+ DIP("cmp r%u, #%u\n", rN, uimm8);
+ goto decode_success;
+ }
+
+ case BITS5(0,0,1,0,0): {
+ /* -------------- (T1) MOVS Rn, #uimm8 -------------- */
+ UInt rD = INSN0(10,8);
+ UInt uimm8 = INSN0(7,0);
+ IRTemp oldV = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ assign( oldC, mk_armg_calculate_flag_c() );
+ assign( res, mkU32(uimm8) );
+ putIRegT(rD, mkexpr(res), condT);
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV,
+ cond_AND_notInIT_T );
+ DIP("movs r%u, #%u\n", rD, uimm8);
+ goto decode_success;
+ }
+
+ case BITS5(0,1,0,0,1): {
+ /* ------------- LDR Rd, [PC, #imm8 * 4] ------------- */
+ /* LDR Rd, [align4(PC) + imm8 * 4] */
+ UInt rD = INSN0(10,8);
+ UInt imm8 = INSN0(7,0);
+ IRTemp ea = newTemp(Ity_I32);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ assign(ea, binop(Iop_Add32,
+ binop(Iop_And32, getIRegT(15), mkU32(~3U)),
+ mkU32(imm8 * 4)));
+ put_ITSTATE(old_itstate); // backout
+ putIRegT(rD, loadLE(Ity_I32, mkexpr(ea)),
+ IRTemp_INVALID);
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("ldr r%u, [pc, #%u]\n", rD, imm8 * 4);
+ goto decode_success;
+ }
+
+ case BITS5(0,1,1,0,0): /* STR */
+ case BITS5(0,1,1,0,1): { /* LDR */
+ /* ------------- LDR Rd, [Rn, #imm5 * 4] ------------- */
+ /* ------------- STR Rd, [Rn, #imm5 * 4] ------------- */
+ /* LDR/STR Rd, [Rn + imm5 * 4] */
+ UInt rD = INSN0(2,0);
+ UInt rN = INSN0(5,3);
+ UInt imm5 = INSN0(10,6);
+ UInt isLD = INSN0(11,11);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRExpr* ea = binop(Iop_Add32, getIRegT(rN), mkU32(imm5 * 4));
+ put_ITSTATE(old_itstate); // backout
+ if (isLD) {
+ putIRegT(rD, loadLE(Ity_I32, ea), IRTemp_INVALID);
+ } else {
+ storeLE( ea, getIRegT(rD) );
+ }
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("%s r%u, [r%u, #%u]\n", isLD ? "ldr" : "str", rD, rN, imm5 * 4);
+ goto decode_success;
+ }
+
+ case BITS5(1,0,0,0,0): /* STRH */
+ case BITS5(1,0,0,0,1): { /* LDRH */
+ /* ------------- LDRH Rd, [Rn, #imm5 * 2] ------------- */
+ /* ------------- STRH Rd, [Rn, #imm5 * 2] ------------- */
+ /* LDRH/STRH Rd, [Rn + imm5 * 2] */
+ UInt rD = INSN0(2,0);
+ UInt rN = INSN0(5,3);
+ UInt imm5 = INSN0(10,6);
+ UInt isLD = INSN0(11,11);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRExpr* ea = binop(Iop_Add32, getIRegT(rN), mkU32(imm5 * 2));
+ put_ITSTATE(old_itstate); // backout
+ if (isLD) {
+ putIRegT(rD, unop(Iop_16Uto32, loadLE(Ity_I16, ea)),
+ IRTemp_INVALID);
+ } else {
+ storeLE( ea, unop(Iop_32to16, getIRegT(rD)) );
+ }
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("%sh r%u, [r%u, #%u]\n", isLD ? "ldr" : "str", rD, rN, imm5 * 2);
+ goto decode_success;
+ }
+
+ case BITS5(0,1,1,1,0): /* STRB */
+ case BITS5(0,1,1,1,1): { /* LDRB */
+ /* ------------- LDRB Rd, [Rn, #imm5] ------------- */
+ /* ------------- STRB Rd, [Rn, #imm5] ------------- */
+ /* LDRB/STRB Rd, [Rn + imm5] */
+ UInt rD = INSN0(2,0);
+ UInt rN = INSN0(5,3);
+ UInt imm5 = INSN0(10,6);
+ UInt isLD = INSN0(11,11);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRExpr* ea = binop(Iop_Add32, getIRegT(rN), mkU32(imm5));
+ put_ITSTATE(old_itstate); // backout
+ if (isLD) {
+ putIRegT(rD, unop(Iop_8Uto32, loadLE(Ity_I8, ea)),
+ IRTemp_INVALID);
+ } else {
+ storeLE( ea, unop(Iop_32to8, getIRegT(rD)) );
+ }
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("%sb r%u, [r%u, #%u]\n", isLD ? "ldr" : "str", rD, rN, imm5);
+ goto decode_success;
+ }
+
+ case BITS5(1,0,0,1,0): /* STR */
+ case BITS5(1,0,0,1,1): { /* LDR */
+ /* ------------- LDR Rd, [SP, #imm8 * 4] ------------- */
+ /* ------------- STR Rd, [SP, #imm8 * 4] ------------- */
+ /* LDR/STR Rd, [SP + imm8 * 4] */
+ UInt rD = INSN0(10,8);
+ UInt imm8 = INSN0(7,0);
+ UInt isLD = INSN0(11,11);
+
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRExpr* ea = binop(Iop_Add32, getIRegT(13), mkU32(imm8 * 4));
+ put_ITSTATE(old_itstate); // backout
+ if (isLD) {
+ putIRegT(rD, loadLE(Ity_I32, ea), IRTemp_INVALID);
+ } else {
+ storeLE(ea, getIRegT(rD));
+ }
+ put_ITSTATE(new_itstate); // restore
+
+ DIP("%s r%u, [sp, #%u]\n", isLD ? "ldr" : "str", rD, imm8 * 4);
+ goto decode_success;
+ }
+
+ case BITS5(1,1,0,0,1): {
+ /* ------------- LDMIA Rn!, {reglist} ------------- */
+ Int i, nRegs = 0;
+ UInt rN = INSN0(10,8);
+ UInt list = INSN0(7,0);
+ /* Empty lists aren't allowed. */
+ if (list != 0) {
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ put_ITSTATE(old_itstate);
+ // now uncond
+
+ IRTemp oldRn = newTemp(Ity_I32);
+ IRTemp base = newTemp(Ity_I32);
+ assign(oldRn, getIRegT(rN));
+ assign(base, binop(Iop_And32, mkexpr(oldRn), mkU32(~3U)));
+ for (i = 0; i < 8; i++) {
+ if (0 == (list & (1 << i)))
+ continue;
+ nRegs++;
+ putIRegT(
+ i, loadLE(Ity_I32,
+ binop(Iop_Add32, mkexpr(base),
+ mkU32(nRegs * 4 - 4))),
+ IRTemp_INVALID
+ );
+ }
+ /* Only do the writeback for rN if it isn't in the list of
+ registers to be transferred. */
+ if (0 == (list & (1 << rN))) {
+ putIRegT(rN,
+ binop(Iop_Add32, mkexpr(oldRn),
+ mkU32(nRegs * 4)),
+ IRTemp_INVALID
+ );
+ }
+
+ /* Reinstate the ITSTATE update. */
+ put_ITSTATE(new_itstate);
+
+ DIP("ldmia r%u!, {0x%04x}\n", rN, list);
+ goto decode_success;
+ }
+ break;
+ }
+
+ case BITS5(1,1,0,0,0): {
+ /* ------------- STMIA Rn!, {reglist} ------------- */
+ Int i, nRegs = 0;
+ UInt rN = INSN0(10,8);
+ UInt list = INSN0(7,0);
+ /* Empty lists aren't allowed. Also, if rN is in the list then
+ it must be the lowest numbered register in the list. */
+ Bool valid = list != 0;
+ if (valid && 0 != (list & (1 << rN))) {
+ for (i = 0; i < rN; i++) {
+ if (0 != (list & (1 << i)))
+ valid = False;
+ }
+ }
+ if (valid) {
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ put_ITSTATE(old_itstate);
+ // now uncond
+
+ IRTemp oldRn = newTemp(Ity_I32);
+ IRTemp base = newTemp(Ity_I32);
+ assign(oldRn, getIRegT(rN));
+ assign(base, binop(Iop_And32, mkexpr(oldRn), mkU32(~3U)));
+ for (i = 0; i < 8; i++) {
+ if (0 == (list & (1 << i)))
+ continue;
+ nRegs++;
+ storeLE( binop(Iop_Add32, mkexpr(base), mkU32(nRegs * 4 - 4)),
+ getIRegT(i) );
+ }
+ /* Always do the writeback. */
+ putIRegT(rN,
+ binop(Iop_Add32, mkexpr(oldRn),
+ mkU32(nRegs * 4)),
+ IRTemp_INVALID);
+
+ /* Reinstate the ITSTATE update. */
+ put_ITSTATE(new_itstate);
+
+ DIP("stmia r%u!, {0x%04x}\n", rN, list);
+ goto decode_success;
+ }
+ break;
+ }
+
+ case BITS5(0,0,0,0,0): /* LSLS */
+ case BITS5(0,0,0,0,1): /* LSRS */
+ case BITS5(0,0,0,1,0): { /* ASRS */
+ /* ---------------- LSLS Rd, Rm, #imm5 ---------------- */
+ /* ---------------- LSRS Rd, Rm, #imm5 ---------------- */
+ /* ---------------- ASRS Rd, Rm, #imm5 ---------------- */
+ UInt rD = INSN0(2,0);
+ UInt rM = INSN0(5,3);
+ UInt imm5 = INSN0(10,6);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp resC = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp oldV = newTemp(Ity_I32);
+ HChar* wot = "???";
+ assign(rMt, getIRegT(rM));
+ assign(oldV, mk_armg_calculate_flag_v());
+ /* Looks like INSN0(12,11) are the standard 'how' encoding.
+ Could compactify if the ROR case later appears. */
+ switch (INSN0(15,11)) {
+ case BITS5(0,0,0,0,0):
+ compute_result_and_C_after_LSL_by_imm5(
+ dis_buf, &res, &resC, rMt, imm5, rM
+ );
+ wot = "lsl";
+ break;
+ case BITS5(0,0,0,0,1):
+ compute_result_and_C_after_LSR_by_imm5(
+ dis_buf, &res, &resC, rMt, imm5, rM
+ );
+ wot = "lsr";
+ break;
+ case BITS5(0,0,0,1,0):
+ compute_result_and_C_after_ASR_by_imm5(
+ dis_buf, &res, &resC, rMt, imm5, rM
+ );
+ wot = "asr";
+ break;
+ default:
+ /*NOTREACHED*/vassert(0);
+ }
+ // not safe to read guest state after this point
+ putIRegT(rD, mkexpr(res), condT);
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, resC, oldV,
+ cond_AND_notInIT_T );
+ /* ignore buf and roll our own output */
+ DIP("%ss r%u, r%u, #%u\n", wot, rD, rM, imm5);
+ goto decode_success;
+ }
+
+ case BITS5(1,1,1,0,0): {
+ /* ---------------- B #simm11 ---------------- */
+ Int simm11 = INSN0(10,0);
+ simm11 = (simm11 << 21) >> 20;
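+         /* Shifting left by 21 places the sign bit (bit 10) at bit
+            31; the arithmetic shift right by one less then
+            sign-extends and doubles the offset in a single step,
+            e.g. 0x7FF -> -2 and 0x001 -> +2. */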
+ UInt dst = simm11 + guest_R15_curr_instr_notENC + 4;
+ /* Only allowed outside or last-in IT block; SIGILL if not so. */
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ // and skip this insn if not selected; being cleverer is too
+ // difficult
+ mk_skip_over_T16_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+ irsb->next = mkU32( dst | 1 /*CPSR.T*/ );
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ DIP("b 0x%x\n", dst);
+ goto decode_success;
+ }
+
+ default:
+ break; /* examine the next shortest prefix */
+
+ }
+
+
+ /* ================ 16-bit 15:12 cases ================ */
+
+ switch (INSN0(15,12)) {
+
+ case BITS4(1,1,0,1): {
+ /* ---------------- Bcond #simm8 ---------------- */
+ UInt cond = INSN0(11,8);
+ Int simm8 = INSN0(7,0);
+ simm8 = (simm8 << 24) >> 23;
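+         /* Same sign-extend-and-double trick as in the B #simm11
+            case above, this time with bit 7 as the sign bit. */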
+ UInt dst = simm8 + guest_R15_curr_instr_notENC + 4;
+ if (cond != ARMCondAL && cond != ARMCondNV) {
+ /* Not allowed in an IT block; SIGILL if so. */
+ gen_SIGILL_T_if_in_ITBlock(old_itstate, new_itstate);
+
+ IRTemp kondT = newTemp(Ity_I32);
+ assign( kondT, mk_armg_calculate_condition(cond) );
+ stmt( IRStmt_Exit( unop(Iop_32to1, mkexpr(kondT)),
+ Ijk_Boring,
+ IRConst_U32(dst | 1/*CPSR.T*/) ));
+ irsb->next = mkU32( (guest_R15_curr_instr_notENC + 2)
+ | 1 /*CPSR.T*/ );
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ DIP("b%s 0x%x\n", nCC(cond), dst);
+ goto decode_success;
+ }
+ break;
+ }
+
+ default:
+ break; /* hmm, nothing matched */
+
+ }
+
+ /* ================ 16-bit misc cases ================ */
+
+ /* ------ NOP ------ */
+ if (INSN0(15,0) == 0xBF00) {
+      DIP("nop\n");
+ goto decode_success;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- -- */
+ /* -- Thumb 32-bit integer instructions -- */
+ /* -- -- */
+ /* ----------------------------------------------------------- */
+
+# define INSN1(_bMax,_bMin) SLICE_UInt(((UInt)insn1), (_bMax), (_bMin))
+
+ /* second 16 bits of the instruction, if any */
+ UShort insn1 = getUShortLittleEndianly( guest_instr+2 );
+
+ anOp = Iop_INVALID; /* paranoia */
+ anOpNm = NULL; /* paranoia */
+
+ /* Change result defaults to suit 32-bit insns. */
+ vassert(dres.whatNext == Dis_Continue);
+ vassert(dres.len == 2);
+ vassert(dres.continueAt == 0);
+ dres.len = 4;
+
+ /* ---------------- BL/BLX simm26 ---------------- */
+ if (BITS5(1,1,1,1,0) == INSN0(15,11) && BITS2(1,1) == INSN1(15,14)) {
+ UInt isBL = INSN1(12,12);
+ UInt bS = INSN0(10,10);
+ UInt bJ1 = INSN1(13,13);
+ UInt bJ2 = INSN1(11,11);
+ UInt bI1 = 1 ^ (bJ1 ^ bS);
+ UInt bI2 = 1 ^ (bJ2 ^ bS);
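+      /* Per the ARM ARM, I1 = NOT(J1 EOR S) and I2 = NOT(J2 EOR S);
+         this undoes the J1/J2 munging and recovers the two high
+         offset bits. */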
+ Int simm25
+ = (bS << (1 + 1 + 10 + 11 + 1))
+ | (bI1 << (1 + 10 + 11 + 1))
+ | (bI2 << (10 + 11 + 1))
+ | (INSN0(9,0) << (11 + 1))
+ | (INSN1(10,0) << 1);
+ simm25 = (simm25 << 7) >> 7;
+
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ UInt dst = simm25 + guest_R15_curr_instr_notENC + 4;
+
+ /* One further validity case to check: in the case of BLX
+ (not-BL), that insn1[0] must be zero. */
+ Bool valid = True;
+ if (isBL == 0 && INSN1(0,0) == 1) valid = False;
+ if (valid) {
+ /* Only allowed outside or last-in IT block; SIGILL if not so. */
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ // and skip this insn if not selected; being cleverer is too
+ // difficult
+ mk_skip_over_T32_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ /* We're returning to Thumb code, hence "| 1" */
+ putIRegT( 14, mkU32( (guest_R15_curr_instr_notENC + 4) | 1 ),
+ IRTemp_INVALID);
+ if (isBL) {
+ /* BL: unconditional T -> T call */
+ /* we're calling Thumb code, hence "| 1" */
+ irsb->next = mkU32( dst | 1 );
+ DIP("bl 0x%x (stay in Thumb mode)\n", dst);
+ } else {
+ /* BLX: unconditional T -> A call */
+         /* we're calling ARM code, hence "& ~3" to align to a
+            valid ARM insn address */
+ irsb->next = mkU32( dst & ~3 );
+ DIP("blx 0x%x (switch to ARM mode)\n", dst & ~3);
+ }
+ irsb->jumpkind = Ijk_Call;
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ }
+
+ /* ---------------- {LD,ST}M{IA,DB} ---------------- */
+ if (0x3a2 == INSN0(15,6) // {LD,ST}MIA
+ || 0x3a4 == INSN0(15,6)) { // {LD,ST}MDB
+ UInt bW = INSN0(5,5); /* writeback Rn ? */
+ UInt bL = INSN0(4,4);
+ UInt rN = INSN0(3,0);
+ UInt bP = INSN1(15,15); /* reglist entry for r15 */
+ UInt bM = INSN1(14,14); /* reglist entry for r14 */
+ UInt rLmost = INSN1(12,0); /* reglist entry for r0 .. 12 */
+ UInt rL13 = INSN1(13,13); /* must be zero */
+ UInt regList = 0;
+ Bool valid = True;
+
+ UInt bINC = 1;
+ UInt bBEFORE = 0;
+ if (INSN0(15,6) == 0x3a4) {
+ bINC = 0;
+ bBEFORE = 1;
+ }
+
+ /* detect statically invalid cases, and construct the final
+ reglist */
+ if (rL13 == 1)
+ valid = False;
+
+ if (bL == 1) {
+ regList = (bP << 15) | (bM << 14) | rLmost;
+ if (rN == 15) valid = False;
+ if (popcount32(regList) < 2) valid = False;
+ if (bP == 1 && bM == 1) valid = False;
+ if (bW == 1 && (regList & (1<<rN))) valid = False;
+ } else {
+ regList = (bM << 14) | rLmost;
+ if (bP == 1) valid = False;
+ if (rN == 15) valid = False;
+ if (popcount32(regList) < 2) valid = False;
+ if (bW == 1 && (regList & (1<<rN))) valid = False;
+ if (regList & (1<<rN)) {
+ UInt i;
+ /* if Rn is in the list, then it must be the
+ lowest numbered entry */
+ for (i = 0; i < rN; i++) {
+ if (regList & (1<<i))
+ valid = False;
+ }
+ }
+ }
+
+ if (valid) {
+ if (bL == 1 && bP == 1) {
+ // We'll be writing the PC. Hence:
+ /* Only allowed outside or last-in IT block; SIGILL if not so. */
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ }
+
+ /* Go uncond: */
+ mk_skip_over_T32_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+         /* Generate the IR. This might generate a write to R15. */
+ mk_ldm_stm(False/*!arm*/, rN, bINC, bBEFORE, bW, bL, regList);
+
+ if (bL == 1 && (regList & (1<<15))) {
+ // If we wrote to R15, we have an interworking return to
+ // deal with.
+ irsb->next = llGetIReg(15);
+ irsb->jumpkind = Ijk_Ret;
+ dres.whatNext = Dis_StopHere;
+ }
+
+ DIP("%sm%c%c r%u%s, {0x%04x}\n",
+ bL == 1 ? "ld" : "st", bINC ? 'i' : 'd', bBEFORE ? 'b' : 'a',
+ rN, bW ? "!" : "", regList);
+
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T3) ADD{S}.W Rd, Rn, #constT -------------- */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && INSN0(9,5) == BITS5(0,1,0,0,0)
+ && INSN1(15,15) == 0) {
+ UInt bS = INSN0(4,4);
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ Bool valid = !isBadRegT(rN) && !isBadRegT(rD);
+ /* but allow "add.w reg, sp, #constT" */
+ if (!valid && rN == 13)
+ valid = True;
+ if (valid) {
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
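+         /* The i:imm3:imm8 immediate field is expanded per the ARM
+            ARM "ThumbExpandImm" rules: a (possibly replicated) 8-bit
+            pattern, or an 8-bit value rotated into position. */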
+ UInt imm32 = thumbExpandImm_from_I0_I1(NULL, insn0, insn1);
+ assign(argL, getIRegT(rN));
+ assign(argR, mkU32(imm32));
+ assign(res, binop(Iop_Add32, mkexpr(argL), mkexpr(argR)));
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS == 1)
+ setFlags_D1_D2( ARMG_CC_OP_ADD, argL, argR, condT );
+ DIP("add%s.w r%u, r%u, #%u\n",
+ bS == 1 ? "s" : "", rD, rN, imm32);
+ goto decode_success;
+ }
+ }
+
+ /* ---------------- (T2) CMP.W Rn, #constT ---------------- */
+ /* ---------------- (T2) CMN.W Rn, #constT ---------------- */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && ( INSN0(9,4) == BITS6(0,1,1,0,1,1) // CMP
+ || INSN0(9,4) == BITS6(0,1,0,0,0,1)) // CMN
+ && INSN1(15,15) == 0
+ && INSN1(11,8) == BITS4(1,1,1,1)) {
+ UInt rN = INSN0(3,0);
+ if (rN != 15) {
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ Bool isCMN = INSN0(9,4) == BITS6(0,1,0,0,0,1);
+ UInt imm32 = thumbExpandImm_from_I0_I1(NULL, insn0, insn1);
+ assign(argL, getIRegT(rN));
+ assign(argR, mkU32(imm32));
+ setFlags_D1_D2( isCMN ? ARMG_CC_OP_ADD : ARMG_CC_OP_SUB,
+ argL, argR, condT );
+ DIP("%s.w r%u, #%u\n", isCMN ? "cmn" : "cmp", rN, imm32);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T1) TST.W Rn, #constT -------------- */
+ /* -------------- (T1) TEQ.W Rn, #constT -------------- */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && ( INSN0(9,4) == BITS6(0,0,0,0,0,1) // TST
+ || INSN0(9,4) == BITS6(0,0,1,0,0,1)) // TEQ
+ && INSN1(15,15) == 0
+ && INSN1(11,8) == BITS4(1,1,1,1)) {
+ UInt rN = INSN0(3,0);
+ if (!isBadRegT(rN)) { // yes, really, it's inconsistent with CMP.W
+ Bool isTST = INSN0(9,4) == BITS6(0,0,0,0,0,1);
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp oldV = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ Bool updC = False;
+ UInt imm32 = thumbExpandImm_from_I0_I1(&updC, insn0, insn1);
+ assign(argL, getIRegT(rN));
+ assign(argR, mkU32(imm32));
+ assign(res, binop(isTST ? Iop_And32 : Iop_Xor32,
+ mkexpr(argL), mkexpr(argR)));
+ assign( oldV, mk_armg_calculate_flag_v() );
+ assign( oldC, updC
+ ? mkU32((imm32 >> 31) & 1)
+ : mk_armg_calculate_flag_c() );
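+         /* If the immediate expansion involved a rotation (updC set),
+            the shifter carry-out is bit 31 of the expanded value;
+            otherwise C is left unchanged. */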
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV, condT );
+ DIP("%s.w r%u, #%u\n", isTST ? "tst" : "teq", rN, imm32);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T3) SUB{S}.W Rd, Rn, #constT -------------- */
+ /* -------------- (T3) RSB{S}.W Rd, Rn, #constT -------------- */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && (INSN0(9,5) == BITS5(0,1,1,0,1) // SUB
+ || INSN0(9,5) == BITS5(0,1,1,1,0)) // RSB
+ && INSN1(15,15) == 0) {
+ Bool isRSB = INSN0(9,5) == BITS5(0,1,1,1,0);
+ UInt bS = INSN0(4,4);
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ Bool valid = !isBadRegT(rN) && !isBadRegT(rD);
+ /* but allow "sub.w sp, sp, #constT" */
+ if (!valid && !isRSB && rN == 13 && rD == 13)
+ valid = True;
+ if (valid) {
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ UInt imm32 = thumbExpandImm_from_I0_I1(NULL, insn0, insn1);
+ assign(argL, getIRegT(rN));
+ assign(argR, mkU32(imm32));
+ assign(res, isRSB
+ ? binop(Iop_Sub32, mkexpr(argR), mkexpr(argL))
+ : binop(Iop_Sub32, mkexpr(argL), mkexpr(argR)));
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS == 1) {
+ if (isRSB)
+ setFlags_D1_D2( ARMG_CC_OP_SUB, argR, argL, condT );
+ else
+ setFlags_D1_D2( ARMG_CC_OP_SUB, argL, argR, condT );
+ }
+ DIP("%s%s.w r%u, r%u, #%u\n",
+ isRSB ? "rsb" : "sub", bS == 1 ? "s" : "", rD, rN, imm32);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T1) ADC{S}.W Rd, Rn, #constT -------------- */
+ /* -------------- (T1) SBC{S}.W Rd, Rn, #constT -------------- */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && ( INSN0(9,5) == BITS5(0,1,0,1,0) // ADC
+ || INSN0(9,5) == BITS5(0,1,0,1,1)) // SBC
+ && INSN1(15,15) == 0) {
+ /* ADC: Rd = Rn + constT + oldC */
+ /* SBC: Rd = Rn - constT - (oldC ^ 1) */
+ UInt bS = INSN0(4,4);
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ if (!isBadRegT(rN) && !isBadRegT(rD)) {
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ UInt imm32 = thumbExpandImm_from_I0_I1(NULL, insn0, insn1);
+ assign(argL, getIRegT(rN));
+ assign(argR, mkU32(imm32));
+ assign(oldC, mk_armg_calculate_flag_c() );
+ HChar* nm = "???";
+ switch (INSN0(9,5)) {
+ case BITS5(0,1,0,1,0): // ADC
+ nm = "adc";
+ assign(res,
+ binop(Iop_Add32,
+ binop(Iop_Add32, mkexpr(argL), mkexpr(argR)),
+ mkexpr(oldC) ));
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS)
+ setFlags_D1_D2_ND( ARMG_CC_OP_ADC,
+ argL, argR, oldC, condT );
+ break;
+ case BITS5(0,1,0,1,1): // SBC
+ nm = "sbc";
+ assign(res,
+ binop(Iop_Sub32,
+ binop(Iop_Sub32, mkexpr(argL), mkexpr(argR)),
+ binop(Iop_Xor32, mkexpr(oldC), mkU32(1)) ));
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS)
+ setFlags_D1_D2_ND( ARMG_CC_OP_SBB,
+ argL, argR, oldC, condT );
+ break;
+ default:
+ vassert(0);
+ }
+ DIP("%s%s.w r%u, r%u, #%u\n",
+ nm, bS == 1 ? "s" : "", rD, rN, imm32);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T1) ORR{S}.W Rd, Rn, #constT -------------- */
+ /* -------------- (T1) AND{S}.W Rd, Rn, #constT -------------- */
+ /* -------------- (T1) BIC{S}.W Rd, Rn, #constT -------------- */
+ /* -------------- (T1) EOR{S}.W Rd, Rn, #constT -------------- */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && ( INSN0(9,5) == BITS5(0,0,0,1,0) // ORR
+ || INSN0(9,5) == BITS5(0,0,0,0,0) // AND
+ || INSN0(9,5) == BITS5(0,0,0,0,1) // BIC
+ || INSN0(9,5) == BITS5(0,0,1,0,0) // EOR
+ || INSN0(9,5) == BITS5(0,0,0,1,1)) // ORN
+ && INSN1(15,15) == 0) {
+ UInt bS = INSN0(4,4);
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ if (!isBadRegT(rN) && !isBadRegT(rD)) {
+ Bool notArgR = False;
+ IROp op = Iop_INVALID;
+ HChar* nm = "???";
+ switch (INSN0(9,5)) {
+ case BITS5(0,0,0,1,0): op = Iop_Or32; nm = "orr"; break;
+ case BITS5(0,0,0,0,0): op = Iop_And32; nm = "and"; break;
+ case BITS5(0,0,0,0,1): op = Iop_And32; nm = "bic";
+ notArgR = True; break;
+ case BITS5(0,0,1,0,0): op = Iop_Xor32; nm = "eor"; break;
+ case BITS5(0,0,0,1,1): op = Iop_Or32; nm = "orn";
+ notArgR = True; break;
+ default: vassert(0);
+ }
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ Bool updC = False;
+ UInt imm32 = thumbExpandImm_from_I0_I1(&updC, insn0, insn1);
+ assign(argL, getIRegT(rN));
+ assign(argR, mkU32(notArgR ? ~imm32 : imm32));
+ assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS) {
+ IRTemp oldV = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ assign( oldC, updC
+ ? mkU32((imm32 >> 31) & 1)
+ : mk_armg_calculate_flag_c() );
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV,
+ condT );
+ }
+ DIP("%s%s.w r%u, r%u, #%u\n",
+ nm, bS == 1 ? "s" : "", rD, rN, imm32);
+ goto decode_success;
+ }
+ }
+
+ /* ---------- (T3) ADD{S}.W Rd, Rn, Rm, {shift} ---------- */
+ /* ---------- (T3) SUB{S}.W Rd, Rn, Rm, {shift} ---------- */
+ /* ---------- (T3) RSB{S}.W Rd, Rn, Rm, {shift} ---------- */
+ if (INSN0(15,9) == BITS7(1,1,1,0,1,0,1)
+ && ( INSN0(8,5) == BITS4(1,0,0,0) // add subopc
+ || INSN0(8,5) == BITS4(1,1,0,1) // sub subopc
+ || INSN0(8,5) == BITS4(1,1,1,0)) // rsb subopc
+ && INSN1(15,15) == 0) {
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ UInt bS = INSN0(4,4);
+ UInt imm5 = (INSN1(14,12) << 2) | INSN1(7,6);
+ UInt how = INSN1(5,4);
+
+ Bool valid = !isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM);
+      /* but allow "add.w reg, sp, reg" with no shift */
+ if (!valid && INSN0(8,5) == BITS4(1,0,0,0) // add
+ && rN == 13 && imm5 == 0 && how == 0) {
+ valid = True;
+ }
+      /* also allow "sub.w sp, sp, reg" with no shift */
+      if (!valid && INSN0(8,5) == BITS4(1,1,0,1) // sub
+ && rD == 13 && rN == 13 && imm5 == 0 && how == 0) {
+ valid = True;
+ }
+ if (valid) {
+ Bool swap = False;
+ IROp op = Iop_INVALID;
+ HChar* nm = "???";
+ switch (INSN0(8,5)) {
+ case BITS4(1,0,0,0): op = Iop_Add32; nm = "add"; break;
+ case BITS4(1,1,0,1): op = Iop_Sub32; nm = "sub"; break;
+ case BITS4(1,1,1,0): op = Iop_Sub32; nm = "rsb";
+ swap = True; break;
+ default: vassert(0);
+ }
+
+ IRTemp argL = newTemp(Ity_I32);
+ assign(argL, getIRegT(rN));
+
+ IRTemp rMt = newTemp(Ity_I32);
+ assign(rMt, getIRegT(rM));
+
+ IRTemp argR = newTemp(Ity_I32);
+ compute_result_and_C_after_shift_by_imm5(
+ dis_buf, &argR, NULL, rMt, how, imm5, rM
+ );
+
+ IRTemp res = newTemp(Ity_I32);
+ assign(res, swap
+ ? binop(op, mkexpr(argR), mkexpr(argL))
+ : binop(op, mkexpr(argL), mkexpr(argR)));
+
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS) {
+ switch (op) {
+ case Iop_Add32:
+ setFlags_D1_D2( ARMG_CC_OP_ADD, argL, argR, condT );
+ break;
+ case Iop_Sub32:
+ if (swap)
+ setFlags_D1_D2( ARMG_CC_OP_SUB, argR, argL, condT );
+ else
+ setFlags_D1_D2( ARMG_CC_OP_SUB, argL, argR, condT );
+ break;
+ default:
+ vassert(0);
+ }
+ }
+
+ DIP("%s%s.w r%u, r%u, %s\n",
+ nm, bS ? "s" : "", rD, rN, dis_buf);
+ goto decode_success;
+ }
+ }
+
+ /* ---------- (T3) ADC{S}.W Rd, Rn, Rm, {shift} ---------- */
+ /* ---------- (T2) SBC{S}.W Rd, Rn, Rm, {shift} ---------- */
+ if (INSN0(15,9) == BITS7(1,1,1,0,1,0,1)
+ && ( INSN0(8,5) == BITS4(1,0,1,0) // adc subopc
+ || INSN0(8,5) == BITS4(1,0,1,1)) // sbc subopc
+ && INSN1(15,15) == 0) {
+ /* ADC: Rd = Rn + shifter_operand + oldC */
+ /* SBC: Rd = Rn - shifter_operand - (oldC ^ 1) */
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM)) {
+ UInt bS = INSN0(4,4);
+ UInt imm5 = (INSN1(14,12) << 2) | INSN1(7,6);
+ UInt how = INSN1(5,4);
+
+ IRTemp argL = newTemp(Ity_I32);
+ assign(argL, getIRegT(rN));
+
+ IRTemp rMt = newTemp(Ity_I32);
+ assign(rMt, getIRegT(rM));
+
+ IRTemp oldC = newTemp(Ity_I32);
+ assign(oldC, mk_armg_calculate_flag_c());
+
+ IRTemp argR = newTemp(Ity_I32);
+ compute_result_and_C_after_shift_by_imm5(
+ dis_buf, &argR, NULL, rMt, how, imm5, rM
+ );
+
+ HChar* nm = "???";
+ IRTemp res = newTemp(Ity_I32);
+ switch (INSN0(8,5)) {
+ case BITS4(1,0,1,0): // ADC
+ nm = "adc";
+ assign(res,
+ binop(Iop_Add32,
+ binop(Iop_Add32, mkexpr(argL), mkexpr(argR)),
+ mkexpr(oldC) ));
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS)
+ setFlags_D1_D2_ND( ARMG_CC_OP_ADC,
+ argL, argR, oldC, condT );
+ break;
+ case BITS4(1,0,1,1): // SBC
+ nm = "sbc";
+ assign(res,
+ binop(Iop_Sub32,
+ binop(Iop_Sub32, mkexpr(argL), mkexpr(argR)),
+ binop(Iop_Xor32, mkexpr(oldC), mkU32(1)) ));
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS)
+ setFlags_D1_D2_ND( ARMG_CC_OP_SBB,
+ argL, argR, oldC, condT );
+ break;
+ default:
+ vassert(0);
+ }
+
+ DIP("%s%s.w r%u, r%u, %s\n",
+ nm, bS ? "s" : "", rD, rN, dis_buf);
+ goto decode_success;
+ }
+ }
+
+ /* ---------- (T3) AND{S}.W Rd, Rn, Rm, {shift} ---------- */
+ /* ---------- (T3) ORR{S}.W Rd, Rn, Rm, {shift} ---------- */
+ /* ---------- (T3) EOR{S}.W Rd, Rn, Rm, {shift} ---------- */
+ /* ---------- (T3) BIC{S}.W Rd, Rn, Rm, {shift} ---------- */
+ /* ---------- (T1) ORN{S}.W Rd, Rn, Rm, {shift} ---------- */
+ if (INSN0(15,9) == BITS7(1,1,1,0,1,0,1)
+ && ( INSN0(8,5) == BITS4(0,0,0,0) // and subopc
+ || INSN0(8,5) == BITS4(0,0,1,0) // orr subopc
+ || INSN0(8,5) == BITS4(0,1,0,0) // eor subopc
+ || INSN0(8,5) == BITS4(0,0,0,1) // bic subopc
+ || INSN0(8,5) == BITS4(0,0,1,1)) // orn subopc
+ && INSN1(15,15) == 0) {
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM)) {
+ Bool notArgR = False;
+ IROp op = Iop_INVALID;
+ HChar* nm = "???";
+ switch (INSN0(8,5)) {
+ case BITS4(0,0,0,0): op = Iop_And32; nm = "and"; break;
+ case BITS4(0,0,1,0): op = Iop_Or32; nm = "orr"; break;
+ case BITS4(0,1,0,0): op = Iop_Xor32; nm = "eor"; break;
+ case BITS4(0,0,0,1): op = Iop_And32; nm = "bic";
+ notArgR = True; break;
+ case BITS4(0,0,1,1): op = Iop_Or32; nm = "orn";
+ notArgR = True; break;
+ default: vassert(0);
+ }
+ UInt bS = INSN0(4,4);
+ UInt imm5 = (INSN1(14,12) << 2) | INSN1(7,6);
+ UInt how = INSN1(5,4);
+
+ IRTemp rNt = newTemp(Ity_I32);
+ assign(rNt, getIRegT(rN));
+
+ IRTemp rMt = newTemp(Ity_I32);
+ assign(rMt, getIRegT(rM));
+
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp oldC = bS ? newTemp(Ity_I32) : IRTemp_INVALID;
+
+ compute_result_and_C_after_shift_by_imm5(
+ dis_buf, &argR, bS ? &oldC : NULL, rMt, how, imm5, rM
+ );
+
+ IRTemp res = newTemp(Ity_I32);
+ if (notArgR) {
+ vassert(op == Iop_And32 || op == Iop_Or32);
+ assign(res, binop(op, mkexpr(rNt),
+ unop(Iop_Not32, mkexpr(argR))));
+ } else {
+ assign(res, binop(op, mkexpr(rNt), mkexpr(argR)));
+ }
+
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS) {
+ IRTemp oldV = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV,
+ condT );
+ }
+
+ DIP("%s%s.w r%u, r%u, %s\n",
+ nm, bS ? "s" : "", rD, rN, dis_buf);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T?) LSL{S}.W Rd, Rn, Rm -------------- */
+ /* -------------- (T?) LSR{S}.W Rd, Rn, Rm -------------- */
+ /* -------------- (T?) ASR{S}.W Rd, Rn, Rm -------------- */
+ /* -------------- (T?) ROR{S}.W Rd, Rn, Rm -------------- */
+ if (INSN0(15,7) == BITS9(1,1,1,1,1,0,1,0,0)
+ && INSN1(15,12) == BITS4(1,1,1,1)
+ && INSN1(7,4) == BITS4(0,0,0,0)) {
+ UInt how = INSN0(6,5); // standard encoding
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ UInt bS = INSN0(4,4);
+ Bool valid = !isBadRegT(rN) && !isBadRegT(rM) && !isBadRegT(rD);
+ if (how == 3) valid = False; //ATC
+ if (valid) {
+ IRTemp rNt = newTemp(Ity_I32);
+ IRTemp rMt = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp oldC = bS ? newTemp(Ity_I32) : IRTemp_INVALID;
+ IRTemp oldV = bS ? newTemp(Ity_I32) : IRTemp_INVALID;
+ HChar* nms[4] = { "lsl", "lsr", "asr", "ror" };
+ HChar* nm = nms[how];
+ assign(rNt, getIRegT(rN));
+ assign(rMt, getIRegT(rM));
+ compute_result_and_C_after_shift_by_reg(
+ dis_buf, &res, bS ? &oldC : NULL,
+ rNt, how, rMt, rN, rM
+ );
+ if (bS)
+ assign(oldV, mk_armg_calculate_flag_v());
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS) {
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV,
+ condT );
+ }
+ DIP("%s%s.w r%u, r%u, r%u\n",
+ nm, bS ? "s" : "", rD, rN, rM);
+ goto decode_success;
+ }
+ }
+
+ /* ------------ (T?) MOV{S}.W Rd, Rn, {shift} ------------ */
+ /* ------------ (T?) MVN{S}.W Rd, Rn, {shift} ------------ */
+ if ((INSN0(15,0) & 0xFFCF) == 0xEA4F
+ && INSN1(15,15) == 0) {
+ UInt rD = INSN1(11,8);
+ UInt rN = INSN1(3,0);
+ if (!isBadRegT(rD) && !isBadRegT(rN)) {
+ UInt bS = INSN0(4,4);
+ UInt isMVN = INSN0(5,5);
+ UInt imm5 = (INSN1(14,12) << 2) | INSN1(7,6);
+ UInt how = INSN1(5,4);
+
+ IRTemp rNt = newTemp(Ity_I32);
+ assign(rNt, getIRegT(rN));
+
+ IRTemp oldRn = newTemp(Ity_I32);
+ IRTemp oldC = bS ? newTemp(Ity_I32) : IRTemp_INVALID;
+ compute_result_and_C_after_shift_by_imm5(
+ dis_buf, &oldRn, bS ? &oldC : NULL, rNt, how, imm5, rN
+ );
+
+ IRTemp res = newTemp(Ity_I32);
+ assign(res, isMVN ? unop(Iop_Not32, mkexpr(oldRn))
+ : mkexpr(oldRn));
+
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS) {
+ IRTemp oldV = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV, condT);
+ }
+ DIP("%s%s.w r%u, %s\n",
+ isMVN ? "mvn" : "mov", bS ? "s" : "", rD, dis_buf);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T?) TST.W Rn, Rm, {shift} -------------- */
+ /* -------------- (T?) TEQ.W Rn, Rm, {shift} -------------- */
+ if (INSN0(15,9) == BITS7(1,1,1,0,1,0,1)
+ && ( INSN0(8,4) == BITS5(0,0,0,0,1) // TST
+ || INSN0(8,4) == BITS5(0,1,0,0,1)) // TEQ
+ && INSN1(15,15) == 0
+ && INSN1(11,8) == BITS4(1,1,1,1)) {
+ UInt rN = INSN0(3,0);
+ UInt rM = INSN1(3,0);
+ if (!isBadRegT(rN) && !isBadRegT(rM)) {
+ Bool isTST = INSN0(8,4) == BITS5(0,0,0,0,1);
+
+ UInt how = INSN1(5,4);
+ UInt imm5 = (INSN1(14,12) << 2) | INSN1(7,6);
+
+ IRTemp argL = newTemp(Ity_I32);
+ assign(argL, getIRegT(rN));
+
+ IRTemp rMt = newTemp(Ity_I32);
+ assign(rMt, getIRegT(rM));
+
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ compute_result_and_C_after_shift_by_imm5(
+ dis_buf, &argR, &oldC, rMt, how, imm5, rM
+ );
+
+ IRTemp oldV = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+
+ IRTemp res = newTemp(Ity_I32);
+ assign(res, binop(isTST ? Iop_And32 : Iop_Xor32,
+ mkexpr(argL), mkexpr(argR)));
+
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV,
+ condT );
+ DIP("%s.w r%u, %s\n", isTST ? "tst" : "teq", rN, dis_buf);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T3) CMP.W Rn, Rm, {shift} -------------- */
+ /* -------------- (T2) CMN.W Rn, Rm, {shift} -------------- */
+ if (INSN0(15,9) == BITS7(1,1,1,0,1,0,1)
+ && ( INSN0(8,4) == BITS5(1,1,0,1,1) // CMP
+ || INSN0(8,4) == BITS5(1,0,0,0,1)) // CMN
+ && INSN1(15,15) == 0
+ && INSN1(11,8) == BITS4(1,1,1,1)) {
+ UInt rN = INSN0(3,0);
+ UInt rM = INSN1(3,0);
+ if (!isBadRegT(rN) && !isBadRegT(rM)) {
+ Bool isCMN = INSN0(8,4) == BITS5(1,0,0,0,1);
+ UInt how = INSN1(5,4);
+ UInt imm5 = (INSN1(14,12) << 2) | INSN1(7,6);
+
+ IRTemp argL = newTemp(Ity_I32);
+ assign(argL, getIRegT(rN));
+
+ IRTemp rMt = newTemp(Ity_I32);
+ assign(rMt, getIRegT(rM));
+
+ IRTemp argR = newTemp(Ity_I32);
+ compute_result_and_C_after_shift_by_imm5(
+ dis_buf, &argR, NULL, rMt, how, imm5, rM
+ );
+
+ setFlags_D1_D2( isCMN ? ARMG_CC_OP_ADD : ARMG_CC_OP_SUB,
+ argL, argR, condT );
+
+ DIP("%s.w r%u, %s\n", isCMN ? "cmn" : "cmp", rN, dis_buf);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T2) MOV{S}.W Rd, #constT -------------- */
+ /* -------------- (T2) MVN{S}.W Rd, #constT -------------- */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && ( INSN0(9,5) == BITS5(0,0,0,1,0) // MOV
+ || INSN0(9,5) == BITS5(0,0,0,1,1)) // MVN
+ && INSN0(3,0) == BITS4(1,1,1,1)
+ && INSN1(15,15) == 0) {
+ UInt rD = INSN1(11,8);
+ if (!isBadRegT(rD)) {
+ Bool updC = False;
+ UInt bS = INSN0(4,4);
+ Bool isMVN = INSN0(5,5) == 1;
+ UInt imm32 = thumbExpandImm_from_I0_I1(&updC, insn0, insn1);
+ IRTemp res = newTemp(Ity_I32);
+ assign(res, mkU32(isMVN ? ~imm32 : imm32));
+ putIRegT(rD, mkexpr(res), condT);
+ if (bS) {
+ IRTemp oldV = newTemp(Ity_I32);
+ IRTemp oldC = newTemp(Ity_I32);
+ assign( oldV, mk_armg_calculate_flag_v() );
+ assign( oldC, updC
+ ? mkU32((imm32 >> 31) & 1)
+ : mk_armg_calculate_flag_c() );
+ setFlags_D1_D2_ND( ARMG_CC_OP_LOGIC, res, oldC, oldV,
+ condT );
+ }
+ DIP("%s%s.w r%u, #%u\n",
+ isMVN ? "mvn" : "mov", bS ? "s" : "", rD, imm32);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T3) MOVW Rd, #imm16 -------------- */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && INSN0(9,4) == BITS6(1,0,0,1,0,0)
+ && INSN1(15,15) == 0) {
+ UInt rD = INSN1(11,8);
+ if (!isBadRegT(rD)) {
+ UInt imm16 = (INSN0(3,0) << 12) | (INSN0(10,10) << 11)
+ | (INSN1(14,12) << 8) | INSN1(7,0);
+ putIRegT(rD, mkU32(imm16), condT);
+ DIP("movw r%u, #%u\n", rD, imm16);
+ goto decode_success;
+ }
+ }
+
+ /* ---------------- MOVT Rd, #imm16 ---------------- */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && INSN0(9,4) == BITS6(1,0,1,1,0,0)
+ && INSN1(15,15) == 0) {
+ UInt rD = INSN1(11,8);
+ if (!isBadRegT(rD)) {
+ UInt imm16 = (INSN0(3,0) << 12) | (INSN0(10,10) << 11)
+ | (INSN1(14,12) << 8) | INSN1(7,0);
+ IRTemp res = newTemp(Ity_I32);
+ assign(res,
+ binop(Iop_Or32,
+ binop(Iop_And32, getIRegT(rD), mkU32(0xFFFF)),
+ mkU32(imm16 << 16)));
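+         /* Only the top halfword of Rd changes; the bottom halfword
+            is preserved. */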
+ putIRegT(rD, mkexpr(res), condT);
+ DIP("movt r%u, #%u\n", rD, imm16);
+ goto decode_success;
+ }
+ }
+
+ /* ---------------- LD/ST reg+/-#imm8 ---------------- */
+ /* Loads and stores of the form:
+ op Rt, [Rn, #-imm8] or
+ op Rt, [Rn], #+/-imm8 or
+ op Rt, [Rn, #+/-imm8]!
+ where op is one of
+ ldrb ldrh ldr ldrsb ldrsh
+ strb strh str
+ */
+ if (INSN0(15,9) == BITS7(1,1,1,1,1,0,0) && INSN1(11,11) == 1) {
+ Bool valid = True;
+ Bool syned = False;
+ Bool isST = False;
+ IRType ty = Ity_I8;
+ HChar* nm = "???";
+
+ switch (INSN0(8,4)) {
+ case BITS5(0,0,0,0,0): // strb
+ nm = "strb"; isST = True; break;
+ case BITS5(0,0,0,0,1): // ldrb
+ nm = "ldrb"; break;
+ case BITS5(1,0,0,0,1): // ldrsb
+ nm = "ldrsb"; syned = True; break;
+ case BITS5(0,0,0,1,0): // strh
+ nm = "strh"; ty = Ity_I16; isST = True; break;
+ case BITS5(0,0,0,1,1): // ldrh
+ nm = "ldrh"; ty = Ity_I16; break;
+ case BITS5(1,0,0,1,1): // ldrsh
+ nm = "ldrsh"; ty = Ity_I16; syned = True; break;
+ case BITS5(0,0,1,0,0): // str
+ nm = "str"; ty = Ity_I32; isST = True; break;
+         case BITS5(0,0,1,0,1): // ldr
+            nm = "ldr"; ty = Ity_I32; break;
+ default:
+ valid = False; break;
+ }
+
+ UInt rN = INSN0(3,0);
+ UInt rT = INSN1(15,12);
+ UInt bP = INSN1(10,10);
+ UInt bU = INSN1(9,9);
+ UInt bW = INSN1(8,8);
+ UInt imm8 = INSN1(7,0);
+ Bool loadsPC = False;
+
+ if (valid) {
+ if (bP == 1 && bU == 1 && bW == 0)
+ valid = False;
+ if (bP == 0 && bW == 0)
+ valid = False;
+ if (rN == 15)
+ valid = False;
+ if (bW == 1 && rN == rT)
+ valid = False;
+ if (ty == Ity_I8 || ty == Ity_I16) {
+ if (isBadRegT(rT))
+ valid = False;
+ } else {
+ /* ty == Ity_I32 */
+ if (isST && rT == 15)
+ valid = False;
+ if (!isST && rT == 15)
+ loadsPC = True;
+ }
+ }
+
+ if (valid) {
+ // if it's a branch, it can't happen in the middle of an IT block
+ if (loadsPC)
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ // go uncond
+ mk_skip_over_T32_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRTemp preAddr = newTemp(Ity_I32);
+ assign(preAddr, getIRegT(rN));
+
+ IRTemp postAddr = newTemp(Ity_I32);
+ assign(postAddr, binop(bU == 1 ? Iop_Add32 : Iop_Sub32,
+ mkexpr(preAddr), mkU32(imm8)));
+
+ IRTemp transAddr = bP == 1 ? postAddr : preAddr;
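+         /* bP selects pre- vs post-indexing: with bP == 1 the
+            transfer uses the offset address (the offset and
+            pre-indexed forms); with bP == 0 it uses the original Rn,
+            and the writeback supplies the new value afterwards
+            (post-indexed form). */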
+
+ if (isST) {
+
+ /* Store. If necessary, update the base register before
+ the store itself, so that the common idiom of "str rX,
+ [sp, #-4]!" (store rX at sp-4, then do new sp = sp-4,
+ a.k.a "push rX") doesn't cause Memcheck to complain
+ that the access is below the stack pointer. Also, not
+ updating sp before the store confuses Valgrind's
+ dynamic stack-extending logic. So do it before the
+ store. Hence we need to snarf the store data before
+ doing the basereg update. */
+
+ /* get hold of the data to be stored */
+ IRTemp oldRt = newTemp(Ity_I32);
+ assign(oldRt, getIRegT(rT));
+
+ /* Update Rn if necessary. */
+ if (bW == 1) {
+ vassert(rN != rT); // assured by validity check above
+ putIRegT(rN, mkexpr(postAddr), IRTemp_INVALID);
+ }
+
+ /* generate the transfer */
+ switch (ty) {
+ case Ity_I8:
+ storeLE(mkexpr(transAddr),
+ unop(Iop_32to8, mkexpr(oldRt)));
+ break;
+ case Ity_I16:
+ storeLE(mkexpr(transAddr),
+ unop(Iop_32to16, mkexpr(oldRt)));
+ break;
+ case Ity_I32:
+ storeLE(mkexpr(transAddr), mkexpr(oldRt));
+ break;
+ default:
+ vassert(0);
+ }
+
+ } else {
+
+ /* Load. */
+
+ /* generate the transfer */
+ IRTemp newRt = newTemp(Ity_I32);
+ IROp widen = Iop_INVALID;
+ switch (ty) {
+ case Ity_I8:
+ widen = syned ? Iop_8Sto32 : Iop_8Uto32; break;
+ case Ity_I16:
+ widen = syned ? Iop_16Sto32 : Iop_16Uto32; break;
+ case Ity_I32:
+ break;
+ default:
+ vassert(0);
+ }
+ if (widen == Iop_INVALID) {
+ assign(newRt, loadLE(ty, mkexpr(transAddr)));
+ } else {
+ assign(newRt, unop(widen, loadLE(ty, mkexpr(transAddr))));
+ }
+ if (loadsPC) {
+ vassert(rT == 15);
+ llPutIReg(rT, mkexpr(newRt));
+ } else {
+ putIRegT(rT, mkexpr(newRt), IRTemp_INVALID);
+ }
+
+ if (loadsPC) {
+ /* Presumably this is an interworking branch. */
+ irsb->next = mkexpr(newRt);
+ irsb->jumpkind = Ijk_Boring; /* or _Ret ? */
+ dres.whatNext = Dis_StopHere;
+ }
+
+ /* Update Rn if necessary. */
+ if (bW == 1) {
+ vassert(rN != rT); // assured by validity check above
+ putIRegT(rN, mkexpr(postAddr), IRTemp_INVALID);
+ }
+ }
+
+ if (bP == 1 && bW == 0) {
+ DIP("%s.w r%u, [r%u, #%c%u]\n",
+ nm, rT, rN, bU ? '+' : '-', imm8);
+ }
+ else if (bP == 1 && bW == 1) {
+ DIP("%s.w r%u, [r%u, #%c%u]!\n",
+ nm, rT, rN, bU ? '+' : '-', imm8);
+ }
+ else {
+ vassert(bP == 0 && bW == 1);
+ DIP("%s.w r%u, [r%u], #%c%u\n",
+ nm, rT, rN, bU ? '+' : '-', imm8);
+ }
+
+ goto decode_success;
+ }
+ }
+
+ /* ------------- LD/ST reg+(reg<<imm2) ------------- */
+ /* Loads and stores of the form:
+        op Rt, [Rn, Rm, LSL #imm2]
+ where op is one of
+ ldrb ldrh ldr ldrsb ldrsh
+ strb strh str
+ */
+ if (INSN0(15,9) == BITS7(1,1,1,1,1,0,0)
+ && INSN1(11,6) == BITS6(0,0,0,0,0,0)) {
+ Bool valid = True;
+ Bool syned = False;
+ Bool isST = False;
+ IRType ty = Ity_I8;
+ HChar* nm = "???";
+
+ switch (INSN0(8,4)) {
+ case BITS5(0,0,0,0,0): // strb
+ nm = "strb"; isST = True; break;
+ case BITS5(0,0,0,0,1): // ldrb
+ nm = "ldrb"; break;
+ case BITS5(1,0,0,0,1): // ldrsb
+ nm = "ldrsb"; syned = True; break;
+ case BITS5(0,0,0,1,0): // strh
+ nm = "strh"; ty = Ity_I16; isST = True; break;
+ case BITS5(0,0,0,1,1): // ldrh
+ nm = "ldrh"; ty = Ity_I16; break;
+ case BITS5(1,0,0,1,1): // ldrsh
+ nm = "ldrsh"; ty = Ity_I16; syned = True; break;
+ case BITS5(0,0,1,0,0): // str
+ nm = "str"; ty = Ity_I32; isST = True; break;
+         case BITS5(0,0,1,0,1): // ldr
+            nm = "ldr"; ty = Ity_I32; break;
+ default:
+ valid = False; break;
+ }
+
+ UInt rN = INSN0(3,0);
+ UInt rM = INSN1(3,0);
+ UInt rT = INSN1(15,12);
+ UInt imm2 = INSN1(5,4);
+ Bool loadsPC = False;
+
+ if (ty == Ity_I8 || ty == Ity_I16) {
+ /* all 8- and 16-bit load and store cases have the
+ same exclusion set. */
+ if (rN == 15 || isBadRegT(rT) || isBadRegT(rM))
+ valid = False;
+ } else {
+ vassert(ty == Ity_I32);
+ if (rN == 15 || isBadRegT(rM))
+ valid = False;
+ if (isST && rT == 15)
+ valid = False;
+         /* If it is a load and rT is 15, that's only allowable if we
+            are not in an IT block, or are the last insn in it. We
+            need to insert a dynamic check for that. */
+ if (!isST && rT == 15)
+ loadsPC = True;
+ }
+
+ if (valid) {
+ // if it's a branch, it can't happen in the middle of an IT block
+ if (loadsPC)
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ // go uncond
+ mk_skip_over_T32_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRTemp transAddr = newTemp(Ity_I32);
+ assign(transAddr,
+ binop( Iop_Add32,
+ getIRegT(rN),
+ binop(Iop_Shl32, getIRegT(rM), mkU8(imm2)) ));
+
+ if (isST) {
+ IRTemp oldRt = newTemp(Ity_I32);
+ assign(oldRt, getIRegT(rT));
+ switch (ty) {
+ case Ity_I8:
+ storeLE(mkexpr(transAddr),
+ unop(Iop_32to8, mkexpr(oldRt)));
+ break;
+ case Ity_I16:
+ storeLE(mkexpr(transAddr),
+ unop(Iop_32to16, mkexpr(oldRt)));
+ break;
+ case Ity_I32:
+ storeLE(mkexpr(transAddr), mkexpr(oldRt));
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ IRTemp newRt = newTemp(Ity_I32);
+ IROp widen = Iop_INVALID;
+ switch (ty) {
+ case Ity_I8:
+ widen = syned ? Iop_8Sto32 : Iop_8Uto32; break;
+ case Ity_I16:
+ widen = syned ? Iop_16Sto32 : Iop_16Uto32; break;
+ case Ity_I32:
+ break;
+ default:
+ vassert(0);
+ }
+ if (widen == Iop_INVALID) {
+ assign(newRt, loadLE(ty, mkexpr(transAddr)));
+ } else {
+ assign(newRt, unop(widen, loadLE(ty, mkexpr(transAddr))));
+ }
+
+ /* If we're loading the PC, putIRegT will assert. So go
+ direct via llPutIReg. In all other cases use putIRegT
+ as it is safer (although could simply use llPutIReg for
+ _all_ cases here.) */
+ if (loadsPC) {
+ vassert(rT == 15);
+ llPutIReg(rT, mkexpr(newRt));
+ } else {
+ putIRegT(rT, mkexpr(newRt), IRTemp_INVALID);
+ }
+
+ if (loadsPC) {
+ /* Presumably this is an interworking branch. */
+ irsb->next = mkexpr(newRt);
+ irsb->jumpkind = Ijk_Boring; /* or _Ret ? */
+ dres.whatNext = Dis_StopHere;
+ }
+ }
+
+ DIP("%s.w r%u, [r%u, r%u, LSL #%u]\n",
+ nm, rT, rN, rM, imm2);
+
+ goto decode_success;
+ }
+ }
+
+ /* --------------- LD/ST reg+imm12 --------------- */
+ /* Loads and stores of the form:
+ op Rt, [Rn, +#imm12]
+ where op is one of
+ ldrb ldrh ldr ldrsb ldrsh
+ strb strh str
+ */
+ if (INSN0(15,9) == BITS7(1,1,1,1,1,0,0)) {
+ Bool valid = True;
+ Bool syned = False;
+ Bool isST = False;
+ IRType ty = Ity_I8;
+ HChar* nm = "???";
+
+ switch (INSN0(8,4)) {
+ case BITS5(0,1,0,0,0): // strb
+ nm = "strb"; isST = True; break;
+ case BITS5(0,1,0,0,1): // ldrb
+ nm = "ldrb"; break;
+ case BITS5(1,1,0,0,1): // ldrsb
+ nm = "ldrsb"; syned = True; break;
+ case BITS5(0,1,0,1,0): // strh
+ nm = "strh"; ty = Ity_I16; isST = True; break;
+ case BITS5(0,1,0,1,1): // ldrh
+ nm = "ldrh"; ty = Ity_I16; break;
+ case BITS5(1,1,0,1,1): // ldrsh
+ nm = "ldrsh"; ty = Ity_I16; syned = True; break;
+ case BITS5(0,1,1,0,0): // str
+ nm = "str"; ty = Ity_I32; isST = True; break;
+         case BITS5(0,1,1,0,1): // ldr
+            nm = "ldr"; ty = Ity_I32; break;
+ default:
+ valid = False; break;
+ }
+
+ UInt rN = INSN0(3,0);
+ UInt rT = INSN1(15,12);
+ UInt imm12 = INSN1(11,0);
+ Bool loadsPC = False;
+
+ if (ty == Ity_I8 || ty == Ity_I16) {
+ /* all 8- and 16-bit load and store cases have the
+ same exclusion set. */
+ if (rN == 15 || isBadRegT(rT))
+ valid = False;
+ } else {
+ vassert(ty == Ity_I32);
+ if (isST) {
+ if (rN == 15 || rT == 15)
+ valid = False;
+ } else {
+            /* For a 32-bit load, rT == 15 is only allowable if we
+               are not in an IT block, or are the last insn in it.
+               We need to insert a dynamic check for that. Also, in
+               this particular case, rN == 15 is allowable; the value
+               obtained for rN is then (apparently)
+               "word-align(address of current insn + 4)". */
+ if (rT == 15)
+ loadsPC = True;
+ }
+ }
+
+ if (valid) {
+ // if it's a branch, it can't happen in the middle of an IT block
+ if (loadsPC)
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ // go uncond
+ mk_skip_over_T32_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRTemp rNt = newTemp(Ity_I32);
+ if (rN == 15) {
+ vassert(ty == Ity_I32 && !isST);
+ assign(rNt, binop(Iop_And32, getIRegT(rN), mkU32(~3)));
+ } else {
+ assign(rNt, getIRegT(rN));
+ }
+
+ IRTemp transAddr = newTemp(Ity_I32);
+ assign(transAddr,
+ binop( Iop_Add32, mkexpr(rNt), mkU32(imm12) ));
+
+ if (isST) {
+ IRTemp oldRt = newTemp(Ity_I32);
+ assign(oldRt, getIRegT(rT));
+ switch (ty) {
+ case Ity_I8:
+ storeLE(mkexpr(transAddr),
+ unop(Iop_32to8, mkexpr(oldRt)));
+ break;
+ case Ity_I16:
+ storeLE(mkexpr(transAddr),
+ unop(Iop_32to16, mkexpr(oldRt)));
+ break;
+ case Ity_I32:
+ storeLE(mkexpr(transAddr), mkexpr(oldRt));
+ break;
+ default:
+ vassert(0);
+ }
+ } else {
+ IRTemp newRt = newTemp(Ity_I32);
+ IROp widen = Iop_INVALID;
+ switch (ty) {
+ case Ity_I8:
+ widen = syned ? Iop_8Sto32 : Iop_8Uto32; break;
+ case Ity_I16:
+ widen = syned ? Iop_16Sto32 : Iop_16Uto32; break;
+ case Ity_I32:
+ break;
+ default:
+ vassert(0);
+ }
+ if (widen == Iop_INVALID) {
+ assign(newRt, loadLE(ty, mkexpr(transAddr)));
+ } else {
+ assign(newRt, unop(widen, loadLE(ty, mkexpr(transAddr))));
+ }
+ putIRegT(rT, mkexpr(newRt), IRTemp_INVALID);
+
+ if (loadsPC) {
+ /* Presumably this is an interworking branch. */
+ irsb->next = mkexpr(newRt);
+ irsb->jumpkind = Ijk_Boring; /* or _Ret ? */
+ dres.whatNext = Dis_StopHere;
+ }
+ }
+
+ DIP("%s.w r%u, [r%u, +#%u]\n", nm, rT, rN, imm12);
+
+ goto decode_success;
+ }
+ }
+
+ /* -------------- LDRD/STRD reg+/-#imm8 -------------- */
+ /* Doubleword loads and stores of the form:
+      ldrd/strd Rt, Rt2, [Rn, #+/-imm8] or
+ ldrd/strd Rt, Rt2, [Rn], #+/-imm8 or
+ ldrd/strd Rt, Rt2, [Rn, #+/-imm8]!
+ */
+ if (INSN0(15,9) == BITS7(1,1,1,0,1,0,0) && INSN0(6,6) == 1) {
+ UInt bP = INSN0(8,8);
+ UInt bU = INSN0(7,7);
+ UInt bW = INSN0(5,5);
+ UInt bL = INSN0(4,4); // 1: load 0: store
+ UInt rN = INSN0(3,0);
+ UInt rT = INSN1(15,12);
+ UInt rT2 = INSN1(11,8);
+ UInt imm8 = INSN1(7,0);
+
+ Bool valid = True;
+ if (bP == 0 && bW == 0) valid = False;
+ if (bW == 1 && (rN == rT || rN == rT2)) valid = False;
+ if (isBadRegT(rT) || isBadRegT(rT2)) valid = False;
+ if (rN == 15) valid = False;
+ if (bL == 1 && rT == rT2) valid = False;
+
+ if (valid) {
+ // go uncond
+ mk_skip_over_T32_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ IRTemp preAddr = newTemp(Ity_I32);
+ assign(preAddr, getIRegT(rN));
+
+ IRTemp postAddr = newTemp(Ity_I32);
+ assign(postAddr, binop(bU == 1 ? Iop_Add32 : Iop_Sub32,
+ mkexpr(preAddr), mkU32(imm8 << 2)));
+
+ IRTemp transAddr = bP == 1 ? postAddr : preAddr;
+
+ if (bL == 0) {
+ IRTemp oldRt = newTemp(Ity_I32);
+ IRTemp oldRt2 = newTemp(Ity_I32);
+ assign(oldRt, getIRegT(rT));
+ assign(oldRt2, getIRegT(rT2));
+ storeLE(mkexpr(transAddr),
+ mkexpr(oldRt));
+ storeLE(binop(Iop_Add32, mkexpr(transAddr), mkU32(4)),
+ mkexpr(oldRt2));
+ } else {
+ IRTemp newRt = newTemp(Ity_I32);
+ IRTemp newRt2 = newTemp(Ity_I32);
+ assign(newRt,
+ loadLE(Ity_I32,
+ mkexpr(transAddr)));
+ assign(newRt2,
+ loadLE(Ity_I32,
+ binop(Iop_Add32, mkexpr(transAddr), mkU32(4))));
+ putIRegT(rT, mkexpr(newRt), IRTemp_INVALID);
+ putIRegT(rT2, mkexpr(newRt2), IRTemp_INVALID);
+ }
+
+ if (bW == 1) {
+ putIRegT(rN, mkexpr(postAddr), IRTemp_INVALID);
+ }
+
+ HChar* nm = bL ? "ldrd" : "strd";
+
+ if (bP == 1 && bW == 0) {
+ DIP("%s.w r%u, r%u, [r%u, #%c%u]\n",
+ nm, rT, rT2, rN, bU ? '+' : '-', imm8 << 2);
+ }
+ else if (bP == 1 && bW == 1) {
+ DIP("%s.w r%u, r%u, [r%u, #%c%u]!\n",
+ nm, rT, rT2, rN, bU ? '+' : '-', imm8 << 2);
+ }
+ else {
+ vassert(bP == 0 && bW == 1);
+ DIP("%s.w r%u, r%u, [r%u], #%c%u\n",
+ nm, rT, rT2, rN, bU ? '+' : '-', imm8 << 2);
+ }
+
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T3) Bcond.W label -------------- */
+ /* This variant carries its own condition, so can't be part of an
+ IT block ... */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && INSN1(15,14) == BITS2(1,0)
+ && INSN1(12,12) == 0) {
+ UInt cond = INSN0(9,6);
+ if (cond != ARMCondAL && cond != ARMCondNV) {
+ Int simm21
+ = (INSN0(10,10) << (1 + 1 + 6 + 11 + 1))
+ | (INSN1(11,11) << (1 + 6 + 11 + 1))
+ | (INSN1(13,13) << (6 + 11 + 1))
+ | (INSN0(5,0) << (11 + 1))
+ | (INSN1(10,0) << 1);
+ simm21 = (simm21 << 11) >> 11;
+
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ UInt dst = simm21 + guest_R15_curr_instr_notENC + 4;
+
+ /* Not allowed in an IT block; SIGILL if so. */
+ gen_SIGILL_T_if_in_ITBlock(old_itstate, new_itstate);
+
+ IRTemp kondT = newTemp(Ity_I32);
+ assign( kondT, mk_armg_calculate_condition(cond) );
+ stmt( IRStmt_Exit( unop(Iop_32to1, mkexpr(kondT)),
+ Ijk_Boring,
+ IRConst_U32(dst | 1/*CPSR.T*/) ));
+ irsb->next = mkU32( (guest_R15_curr_instr_notENC + 4)
+ | 1 /*CPSR.T*/ );
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ DIP("b%s.w 0x%x\n", nCC(cond), dst);
+ goto decode_success;
+ }
+ }
+
+ /* ---------------- (T4) B.W label ---------------- */
+   /* ... whereas this variant doesn't carry its own condition, so it
+      has to be either unconditional, or conditional by virtue of
+      being the last in an IT block. The upside is that there are 4
+      more bits available for the jump offset, so it has a 16-times
+      greater branch range than the T3 variant. */
+ if (INSN0(15,11) == BITS5(1,1,1,1,0)
+ && INSN1(15,14) == BITS2(1,0)
+ && INSN1(12,12) == 1) {
+ if (1) {
+ UInt bS = INSN0(10,10);
+ UInt bJ1 = INSN1(13,13);
+ UInt bJ2 = INSN1(11,11);
+ UInt bI1 = 1 ^ (bJ1 ^ bS);
+ UInt bI2 = 1 ^ (bJ2 ^ bS);
+ Int simm25
+ = (bS << (1 + 1 + 10 + 11 + 1))
+ | (bI1 << (1 + 10 + 11 + 1))
+ | (bI2 << (10 + 11 + 1))
+ | (INSN0(9,0) << (11 + 1))
+ | (INSN1(10,0) << 1);
+ simm25 = (simm25 << 7) >> 7;
+
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ UInt dst = simm25 + guest_R15_curr_instr_notENC + 4;
+
+ /* If in an IT block, must be the last insn. */
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+
+ // go uncond
+ mk_skip_over_T32_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+ // now uncond
+
+ // branch to dst
+ irsb->next = mkU32( dst | 1 /*CPSR.T*/ );
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ DIP("b.w 0x%x\n", dst);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------ TBB, TBH ------------------ */
+ if (INSN0(15,4) == 0xE8D && INSN1(15,5) == 0x780) {
+ UInt rN = INSN0(3,0);
+ UInt rM = INSN1(3,0);
+ UInt bH = INSN1(4,4);
+ if (bH/*ATC*/ || (rN != 13 && !isBadRegT(rM))) {
+         /* Must be outside an IT block, or the last insn in one */
+ gen_SIGILL_T_if_in_but_NLI_ITBlock(old_itstate, new_itstate);
+ /* Go uncond */
+ mk_skip_over_T32_if_cond_is_false(condT);
+ condT = IRTemp_INVALID;
+
+ IRExpr* ea
+ = binop(Iop_Add32,
+ getIRegT(rN),
+ bH ? binop(Iop_Shl32, getIRegT(rM), mkU8(1))
+ : getIRegT(rM));
+
+ IRTemp delta = newTemp(Ity_I32);
+ if (bH) {
+ assign(delta, unop(Iop_16Uto32, loadLE(Ity_I16, ea)));
+ } else {
+ assign(delta, unop(Iop_8Uto32, loadLE(Ity_I8, ea)));
+ }
+
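+         /* The loaded table entry is a halfword count: the branch
+            target is PC + 2 * entry, with bit 0 set so as to remain
+            in Thumb mode. */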
+ irsb->next
+ = binop(Iop_Or32,
+ binop(Iop_Add32,
+ getIRegT(15),
+ binop(Iop_Shl32, mkexpr(delta), mkU8(1))
+ ),
+ mkU32(1)
+ );
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ DIP("tb%c [r%u, r%u%s]\n",
+ bH ? 'h' : 'b', rN, rM, bH ? ", LSL #1" : "");
+ goto decode_success;
+ }
+ }
+
+ /* ------------------ UBFX ------------------ */
+ /* ------------------ SBFX ------------------ */
+   /* There are also ARM versions of the same, but it doesn't seem
+      worth the hassle to common up the handling (it's only a couple
+      of C statements). */
+ if ((INSN0(15,4) == 0xF3C // UBFX
+ || INSN0(15,4) == 0xF34) // SBFX
+ && INSN1(15,15) == 0 && INSN1(5,5) == 0) {
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt lsb = (INSN1(14,12) << 2) | INSN1(7,6);
+ UInt wm1 = INSN1(4,0);
+ UInt msb = lsb + wm1;
+ if (!isBadRegT(rD) && !isBadRegT(rN) && msb <= 31) {
+ Bool isU = INSN0(15,4) == 0xF3C;
+ IRTemp src = newTemp(Ity_I32);
+ IRTemp tmp = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ UInt mask = ((1 << wm1) - 1) + (1 << wm1);
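+         /* mask == 2^(wm1+1) - 1, computed this way so that wm1 == 31
+            doesn't require an (undefined) shift by 32. */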
+         vassert(msb <= 31);
+ vassert(mask != 0); // guaranteed by msb being in 0 .. 31 inclusive
+
+ assign(src, getIRegT(rN));
+ assign(tmp, binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(src), mkU8(lsb)),
+ mkU32(mask)));
+ assign(res, binop(isU ? Iop_Shr32 : Iop_Sar32,
+ binop(Iop_Shl32, mkexpr(tmp), mkU8(31-wm1)),
+ mkU8(31-wm1)));
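+         /* Shifting left and then right by (31 - wm1) propagates bit
+            wm1 of the extracted field: Shr32 zero-extends it (UBFX),
+            Sar32 sign-extends it (SBFX). */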
+
+ putIRegT(rD, mkexpr(res), condT);
+
+ DIP("%s r%u, r%u, #%u, #%u\n",
+ isU ? "ubfx" : "sbfx", rD, rN, lsb, wm1 + 1);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------ UXTB ------------------ */
+ /* ------------------ UXTH ------------------ */
+ /* ------------------ SXTB ------------------ */
+ /* ------------------ SXTH ------------------ */
+ /* ----------------- UXTB16 ----------------- */
+ /* ----------------- SXTB16 ----------------- */
+ /* FIXME: this is an exact duplicate of the ARM version. They
+ should be commoned up. */
+ if ((INSN0(15,0) == 0xFA5F // UXTB
+ || INSN0(15,0) == 0xFA1F // UXTH
+ || INSN0(15,0) == 0xFA4F // SXTB
+ || INSN0(15,0) == 0xFA0F // SXTH
+ || INSN0(15,0) == 0xFA3F // UXTB16
+ || INSN0(15,0) == 0xFA2F) // SXTB16
+ && INSN1(15,12) == BITS4(1,1,1,1)
+ && INSN1(7,6) == BITS2(1,0)) {
+ UInt rD = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ UInt rot = INSN1(5,4);
+ if (!isBadRegT(rD) && !isBadRegT(rM)) {
+ HChar* nm = "???";
+ IRTemp srcT = newTemp(Ity_I32);
+ IRTemp rotT = newTemp(Ity_I32);
+ IRTemp dstT = newTemp(Ity_I32);
+ assign(srcT, getIRegT(rM));
+ assign(rotT, genROR32(srcT, 8 * rot));
+ switch (INSN0(15,0)) {
+ case 0xFA5F: // UXTB
+ nm = "uxtb";
+ assign(dstT, unop(Iop_8Uto32,
+ unop(Iop_32to8, mkexpr(rotT))));
+ break;
+ case 0xFA1F: // UXTH
+ nm = "uxth";
+ assign(dstT, unop(Iop_16Uto32,
+ unop(Iop_32to16, mkexpr(rotT))));
+ break;
+ case 0xFA4F: // SXTB
+ nm = "sxtb";
+ assign(dstT, unop(Iop_8Sto32,
+ unop(Iop_32to8, mkexpr(rotT))));
+ break;
+ case 0xFA0F: // SXTH
+ nm = "sxth";
+ assign(dstT, unop(Iop_16Sto32,
+ unop(Iop_32to16, mkexpr(rotT))));
+ break;
+ case 0xFA3F: // UXTB16
+ nm = "uxtb16";
+ assign(dstT, binop(Iop_And32, mkexpr(rotT),
+ mkU32(0x00FF00FF)));
+ break;
+ case 0xFA2F: { // SXTB16
+ nm = "sxtb16";
+ IRTemp lo32 = newTemp(Ity_I32);
+ IRTemp hi32 = newTemp(Ity_I32);
+ assign(lo32, binop(Iop_And32, mkexpr(rotT), mkU32(0xFF)));
+ assign(hi32, binop(Iop_Shr32, mkexpr(rotT), mkU8(16)));
+ assign(
+ dstT,
+ binop(Iop_Or32,
+ binop(Iop_And32,
+ unop(Iop_8Sto32,
+ unop(Iop_32to8, mkexpr(lo32))),
+ mkU32(0xFFFF)),
+ binop(Iop_Shl32,
+ unop(Iop_8Sto32,
+ unop(Iop_32to8, mkexpr(hi32))),
+ mkU8(16))
+ ));
+ break;
+ }
+ default:
+ vassert(0);
+ }
+ putIRegT(rD, mkexpr(dstT), condT);
+ DIP("%s r%u, r%u, ror #%u\n", nm, rD, rM, 8 * rot);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- MUL.W Rd, Rn, Rm -------------- */
+ if (INSN0(15,4) == 0xFB0
+ && (INSN1(15,0) & 0xF0F0) == 0xF000) {
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM)) {
+ IRTemp res = newTemp(Ity_I32);
+ assign(res, binop(Iop_Mul32, getIRegT(rN), getIRegT(rM)));
+ putIRegT(rD, mkexpr(res), condT);
+ DIP("mul.w r%u, r%u, r%u\n", rD, rN, rM);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------ {U,S}MULL ------------------ */
+ if ((INSN0(15,4) == 0xFB8 || INSN0(15,4) == 0xFBA)
+ && INSN1(7,4) == BITS4(0,0,0,0)) {
+ UInt isU = INSN0(5,5);
+ UInt rN = INSN0(3,0);
+ UInt rDlo = INSN1(15,12);
+ UInt rDhi = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ if (!isBadRegT(rDhi) && !isBadRegT(rDlo)
+ && !isBadRegT(rN) && !isBadRegT(rM) && rDlo != rDhi) {
+ IRTemp res = newTemp(Ity_I64);
+ assign(res, binop(isU ? Iop_MullU32 : Iop_MullS32,
+ getIRegT(rN), getIRegT(rM)));
+ putIRegT( rDhi, unop(Iop_64HIto32, mkexpr(res)), condT );
+ putIRegT( rDlo, unop(Iop_64to32, mkexpr(res)), condT );
+ DIP("%cmull r%u, r%u, r%u, r%u\n",
+ isU ? 'u' : 's', rDlo, rDhi, rN, rM);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------ ML{A,S} ------------------ */
+ if (INSN0(15,4) == 0xFB0
+ && ( INSN1(7,4) == BITS4(0,0,0,0) // MLA
+ || INSN1(7,4) == BITS4(0,0,0,1))) { // MLS
+ UInt rN = INSN0(3,0);
+ UInt rA = INSN1(15,12);
+ UInt rD = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ if (!isBadRegT(rD) && !isBadRegT(rN)
+ && !isBadRegT(rM) && !isBadRegT(rA)) {
+ Bool isMLA = INSN1(7,4) == BITS4(0,0,0,0);
+ IRTemp res = newTemp(Ity_I32);
+ assign(res,
+ binop(isMLA ? Iop_Add32 : Iop_Sub32,
+ getIRegT(rA),
+ binop(Iop_Mul32, getIRegT(rN), getIRegT(rM))));
+ putIRegT(rD, mkexpr(res), condT);
+ DIP("%s r%u, r%u, r%u, r%u\n",
+ isMLA ? "mla" : "mls", rD, rN, rM, rA);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------ (T3) ADR ------------------ */
+ if ((INSN0(15,0) == 0xF20F || INSN0(15,0) == 0xF60F)
+ && INSN1(15,15) == 0) {
+ /* rD = align4(PC) + imm32 */
+ UInt rD = INSN1(11,8);
+ if (!isBadRegT(rD)) {
+ UInt imm32 = (INSN0(10,10) << 11)
+ | (INSN1(14,12) << 8) | INSN1(7,0);
+ putIRegT(rD, binop(Iop_Add32,
+ binop(Iop_And32, getIRegT(15), mkU32(~3U)),
+ mkU32(imm32)),
+ condT);
+ DIP("add r%u, pc, #%u\n", rD, imm32);
+ goto decode_success;
+ }
+ }
+
+ /* ----------------- (T1) UMLAL ----------------- */
+ /* ----------------- (T1) SMLAL ----------------- */
+ if ((INSN0(15,4) == 0xFBE // UMLAL
+ || INSN0(15,4) == 0xFBC) // SMLAL
+ && INSN1(7,4) == BITS4(0,0,0,0)) {
+ UInt rN = INSN0(3,0);
+ UInt rDlo = INSN1(15,12);
+ UInt rDhi = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ if (!isBadRegT(rDlo) && !isBadRegT(rDhi) && !isBadRegT(rN)
+ && !isBadRegT(rM) && rDhi != rDlo) {
+ Bool isS = INSN0(15,4) == 0xFBC;
+ IRTemp argL = newTemp(Ity_I32);
+ IRTemp argR = newTemp(Ity_I32);
+ IRTemp old = newTemp(Ity_I64);
+ IRTemp res = newTemp(Ity_I64);
+ IRTemp resHi = newTemp(Ity_I32);
+ IRTemp resLo = newTemp(Ity_I32);
+ IROp mulOp = isS ? Iop_MullS32 : Iop_MullU32;
+ assign( argL, getIRegT(rM));
+ assign( argR, getIRegT(rN));
+ assign( old, binop(Iop_32HLto64, getIRegT(rDhi), getIRegT(rDlo)) );
+ assign( res, binop(Iop_Add64,
+ mkexpr(old),
+ binop(mulOp, mkexpr(argL), mkexpr(argR))) );
+ assign( resHi, unop(Iop_64HIto32, mkexpr(res)) );
+ assign( resLo, unop(Iop_64to32, mkexpr(res)) );
+ putIRegT( rDhi, mkexpr(resHi), condT );
+ putIRegT( rDlo, mkexpr(resLo), condT );
+ DIP("%cmlal r%u, r%u, r%u, r%u\n",
+ isS ? 's' : 'u', rDlo, rDhi, rN, rM);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------ (T2) ADR ------------------ */
+ if ((INSN0(15,0) == 0xF2AF || INSN0(15,0) == 0xF6AF)
+ && INSN1(15,15) == 0) {
+ /* rD = align4(PC) - imm32 */
+ UInt rD = INSN1(11,8);
+ if (!isBadRegT(rD)) {
+ UInt imm32 = (INSN0(10,10) << 11)
+ | (INSN1(14,12) << 8) | INSN1(7,0);
+ putIRegT(rD, binop(Iop_Sub32,
+ binop(Iop_And32, getIRegT(15), mkU32(~3U)),
+ mkU32(imm32)),
+ condT);
+ DIP("sub r%u, pc, #%u\n", rD, imm32);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------- (T1) BFI ------------------- */
+ /* ------------------- (T1) BFC ------------------- */
+ if (INSN0(15,4) == 0xF36 && INSN1(15,15) == 0 && INSN1(5,5) == 0) {
+ UInt rD = INSN1(11,8);
+ UInt rN = INSN0(3,0);
+ UInt msb = INSN1(4,0);
+ UInt lsb = (INSN1(14,12) << 2) | INSN1(7,6);
+ if (isBadRegT(rD) || rN == 13 || msb < lsb) {
+ /* undecodable; fall through */
+ } else {
+ IRTemp src = newTemp(Ity_I32);
+ IRTemp olddst = newTemp(Ity_I32);
+ IRTemp newdst = newTemp(Ity_I32);
+ UInt mask = 1 << (msb - lsb);
+ mask = (mask - 1) + mask;
+         vassert(mask != 0); // guaranteed since msb >= lsb, checked above
+ mask <<= lsb;
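+         /* mask now covers exactly bits msb .. lsb inclusive. */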
+
+ assign(src, rN == 15 ? mkU32(0) : getIRegT(rN));
+ assign(olddst, getIRegT(rD));
+ assign(newdst,
+ binop(Iop_Or32,
+ binop(Iop_And32,
+ binop(Iop_Shl32, mkexpr(src), mkU8(lsb)),
+ mkU32(mask)),
+ binop(Iop_And32,
+ mkexpr(olddst),
+ mkU32(~mask)))
+ );
+
+ putIRegT(rD, mkexpr(newdst), condT);
+
+ if (rN == 15) {
+ DIP("bfc r%u, #%u, #%u\n",
+ rD, lsb, msb-lsb+1);
+ } else {
+ DIP("bfi r%u, r%u, #%u, #%u\n",
+ rD, rN, lsb, msb-lsb+1);
+ }
+ goto decode_success;
+ }
+ }
+
+ /* ------------------- (T1) SXTAH ------------------- */
+ /* ------------------- (T1) UXTAH ------------------- */
+ if ((INSN0(15,4) == 0xFA1 // UXTAH
+ || INSN0(15,4) == 0xFA0) // SXTAH
+ && INSN1(15,12) == BITS4(1,1,1,1)
+ && INSN1(7,6) == BITS2(1,0)) {
+ Bool isU = INSN0(15,4) == 0xFA1;
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ UInt rot = INSN1(5,4);
+ if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM)) {
+ IRTemp srcL = newTemp(Ity_I32);
+ IRTemp srcR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign(srcR, getIRegT(rM));
+ assign(srcL, getIRegT(rN));
+ assign(res, binop(Iop_Add32,
+ mkexpr(srcL),
+ unop(isU ? Iop_16Uto32 : Iop_16Sto32,
+ unop(Iop_32to16,
+ genROR32(srcR, 8 * rot)))));
+ putIRegT(rD, mkexpr(res), condT);
+ DIP("%cxtah r%u, r%u, r%u, ror #%u\n",
+ isU ? 'u' : 's', rD, rN, rM, rot);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------- (T1) SXTAB ------------------- */
+ /* ------------------- (T1) UXTAB ------------------- */
+ if ((INSN0(15,4) == 0xFA5 // UXTAB
+ || INSN0(15,4) == 0xFA4) // SXTAB
+ && INSN1(15,12) == BITS4(1,1,1,1)
+ && INSN1(7,6) == BITS2(1,0)) {
+ Bool isU = INSN0(15,4) == 0xFA5;
+ UInt rN = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM = INSN1(3,0);
+ UInt rot = INSN1(5,4);
+ if (!isBadRegT(rD) && !isBadRegT(rN) && !isBadRegT(rM)) {
+ IRTemp srcL = newTemp(Ity_I32);
+ IRTemp srcR = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign(srcR, getIRegT(rM));
+ assign(srcL, getIRegT(rN));
+ assign(res, binop(Iop_Add32,
+ mkexpr(srcL),
+ unop(isU ? Iop_8Uto32 : Iop_8Sto32,
+ unop(Iop_32to8,
+ genROR32(srcR, 8 * rot)))));
+ putIRegT(rD, mkexpr(res), condT);
+ DIP("%cxtab r%u, r%u, r%u, ror #%u\n",
+ isU ? 'u' : 's', rD, rN, rM, rot);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------- (T1) CLZ ------------------- */
+ if (INSN0(15,4) == 0xFAB
+ && INSN1(15,12) == BITS4(1,1,1,1)
+ && INSN1(7,4) == BITS4(1,0,0,0)) {
+ UInt rM1 = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM2 = INSN1(3,0);
+ if (!isBadRegT(rD) && !isBadRegT(rM1) && rM1 == rM2) {
+ IRTemp arg = newTemp(Ity_I32);
+ IRTemp res = newTemp(Ity_I32);
+ assign(arg, getIRegT(rM1));
+ assign(res, IRExpr_Mux0X(
+ unop(Iop_1Uto8,binop(Iop_CmpEQ32,
+ mkexpr(arg),
+ mkU32(0))),
+ unop(Iop_Clz32, mkexpr(arg)),
+ mkU32(32)
+ ));
+ putIRegT(rD, mkexpr(res), condT);
+ DIP("clz r%u, r%u\n", rD, rM1);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------- (T1) RBIT ------------------- */
+ if (INSN0(15,4) == 0xFA9
+ && INSN1(15,12) == BITS4(1,1,1,1)
+ && INSN1(7,4) == BITS4(1,0,1,0)) {
+ UInt rM1 = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM2 = INSN1(3,0);
+ if (!isBadRegT(rD) && !isBadRegT(rM1) && rM1 == rM2) {
+ IRTemp arg = newTemp(Ity_I32);
+ assign(arg, getIRegT(rM1));
+ IRTemp res = gen_BITREV(arg);
+ putIRegT(rD, mkexpr(res), condT);
+ DIP("rbit r%u, r%u\n", rD, rM1);
+ goto decode_success;
+ }
+ }
+
+ /* ------------------- (T2) REV ------------------- */
+ /* ------------------- (T2) REV16 ------------------- */
+ if (INSN0(15,4) == 0xFA9
+ && INSN1(15,12) == BITS4(1,1,1,1)
+ && ( INSN1(7,4) == BITS4(1,0,0,0) // REV
+ || INSN1(7,4) == BITS4(1,0,0,1))) { // REV16
+ UInt rM1 = INSN0(3,0);
+ UInt rD = INSN1(11,8);
+ UInt rM2 = INSN1(3,0);
+ Bool isREV = INSN1(7,4) == BITS4(1,0,0,0);
+ if (!isBadRegT(rD) && !isBadRegT(rM1) && rM1 == rM2) {
+ IRTemp arg = newTemp(Ity_I32);
+ assign(arg, getIRegT(rM1));
+ IRTemp res = isREV ? gen_REV(arg) : gen_REV16(arg);
+ putIRegT(rD, mkexpr(res), condT);
+ DIP("rev%s r%u, r%u\n", isREV ? "" : "16", rD, rM1);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T1) MSR apsr, reg -------------- */
+ if (INSN0(15,4) == 0xF38
+ && INSN1(15,12) == BITS4(1,0,0,0) && INSN1(9,0) == 0x000) {
+ UInt rN = INSN0(3,0);
+ UInt write_ge = INSN1(10,10);
+ UInt write_nzcvq = INSN1(11,11);
+ if (!isBadRegT(rN) && (write_nzcvq || write_ge)) {
+ IRTemp rNt = newTemp(Ity_I32);
+ assign(rNt, getIRegT(rN));
+ desynthesise_APSR( write_nzcvq, write_ge, rNt, condT );
+ DIP("msr cpsr_%s%s, r%u\n",
+ write_nzcvq ? "f" : "", write_ge ? "g" : "", rN);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- (T1) MRS reg, apsr -------------- */
+ if (INSN0(15,0) == 0xF3EF
+ && INSN1(15,12) == BITS4(1,0,0,0) && INSN1(7,0) == 0x00) {
+ UInt rD = INSN1(11,8);
+ if (!isBadRegT(rD)) {
+ IRTemp apsr = synthesise_APSR();
+ putIRegT( rD, mkexpr(apsr), condT );
+ DIP("mrs r%u, cpsr\n", rD);
+ goto decode_success;
+ }
+ }
+
+ /* ----------------- (T1) LDREX ----------------- */
+ if (INSN0(15,4) == 0xE85 && INSN1(11,8) == BITS4(1,1,1,1)) {
+ UInt rN = INSN0(3,0);
+ UInt rT = INSN1(15,12);
+ UInt imm8 = INSN1(7,0);
+ if (!isBadRegT(rT) && rN != 15) {
+ IRTemp res;
+ // go uncond
+ mk_skip_over_T32_if_cond_is_false( condT );
+ // now uncond
+ res = newTemp(Ity_I32);
+ stmt( IRStmt_LLSC(Iend_LE,
+ res,
+ binop(Iop_Add32, getIRegT(rN), mkU32(imm8 * 4)),
+ NULL/*this is a load*/ ));
+ putIRegT(rT, mkexpr(res), IRTemp_INVALID);
+ DIP("ldrex r%u, [r%u, #+%u]\n", rT, rN, imm8 * 4);
+ goto decode_success;
+ }
+ }
+
+ /* ----------------- (T1) STREX ----------------- */
+ if (INSN0(15,4) == 0xE84) {
+ UInt rN = INSN0(3,0);
+ UInt rT = INSN1(15,12);
+ UInt rD = INSN1(11,8);
+ UInt imm8 = INSN1(7,0);
+ if (!isBadRegT(rD) && !isBadRegT(rT) && rN != 15
+ && rD != rN && rD != rT) {
+ IRTemp resSC1, resSC32;
+
+ // go uncond
+ mk_skip_over_T32_if_cond_is_false( condT );
+ // now uncond
+
+ /* Ok, now we're unconditional. Do the store. */
+ resSC1 = newTemp(Ity_I1);
+ stmt( IRStmt_LLSC(Iend_LE,
+ resSC1,
+ binop(Iop_Add32, getIRegT(rN), mkU32(imm8 * 4)),
+ getIRegT(rT)) );
+
+ /* Set rD to 1 on failure, 0 on success. Currently we have
+ resSC1 == 0 on failure, 1 on success. */
+ resSC32 = newTemp(Ity_I32);
+ assign(resSC32,
+ unop(Iop_1Uto32, unop(Iop_Not1, mkexpr(resSC1))));
+
+ putIRegT(rD, mkexpr(resSC32), IRTemp_INVALID);
+ DIP("strex r%u, r%u, [r%u, #+%u]\n", rD, rT, rN, imm8 * 4);
+ goto decode_success;
+ }
+ }
+
+ /* -------------- v7 barrier insns -------------- */
+ if (INSN0(15,0) == 0xF3BF && (INSN1(15,0) & 0xFF0F) == 0x8F0F) {
+ /* XXX this isn't really right, is it? The generated IR does
+ them unconditionally. I guess it doesn't matter since it
+ doesn't do any harm to do them even when the guarding
+ condition is false -- it's just a performance loss. */
+ switch (INSN1(7,4)) {
+ case 0x4: /* DSB */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("DSB\n");
+ goto decode_success;
+ case 0x5: /* DMB */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("DMB\n");
+ goto decode_success;
+ case 0x6: /* ISB */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("ISB\n");
+ goto decode_success;
+ default:
+ break;
+ }
+ }
+
+ /* ------------------- NOP ------------------ */
+ if (INSN0(15,0) == 0xF3AF && INSN1(15,0) == 0x8000) {
+ DIP("nop\n");
+ goto decode_success;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- VFP (CP 10, CP 11) instructions (in Thumb mode) -- */
+ /* ----------------------------------------------------------- */
+
+ if (INSN0(15,12) == BITS4(1,1,1,0)) {
+ UInt insn28 = (INSN0(11,0) << 16) | INSN1(15,0);
+ Bool ok_vfp = decode_CP10_CP11_instruction (
+ &dres, insn28, condT, ARMCondAL/*bogus*/,
+ True/*isT*/
+ );
+ if (ok_vfp)
+ goto decode_success;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- NEON instructions (in Thumb mode) -- */
+ /* ----------------------------------------------------------- */
+
+ if (archinfo->hwcaps & VEX_HWCAPS_ARM_NEON) {
+ UInt insn32 = (INSN0(15,0) << 16) | INSN1(15,0);
+ Bool ok_neon = decode_NEON_instruction(
+ &dres, insn32, condT, True/*isT*/
+ );
+ if (ok_neon)
+ goto decode_success;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- v6 media instructions (in Thumb mode) -- */
+ /* ----------------------------------------------------------- */
+
+ { UInt insn32 = (INSN0(15,0) << 16) | INSN1(15,0);
+ Bool ok_v6m = decode_V6MEDIA_instruction(
+ &dres, insn32, condT, ARMCondAL/*bogus*/,
+ True/*isT*/
+ );
+ if (ok_v6m)
+ goto decode_success;
+ }
+
+ /* ----------------------------------------------------------- */
+ /* -- Undecodable -- */
+ /* ----------------------------------------------------------- */
+
+ goto decode_failure;
+ /*NOTREACHED*/
+
+ decode_failure:
+ /* All decode failures end up here. */
+ vex_printf("disInstr(thumb): unhandled instruction: "
+ "0x%04x 0x%04x\n", (UInt)insn0, (UInt)insn1);
+
+ /* Back up ITSTATE to the initial value for this instruction.
+ If we don't do that, any subsequent restart of the instruction
+ will restart with the wrong value. */
+ put_ITSTATE(old_itstate);
+ /* Tell the dispatcher that this insn cannot be decoded, and so has
+ not been executed, and (is currently) the next to be executed.
+     R15 should be up-to-date since it was made so at the start of
+     each insn, but nevertheless be paranoid and update it again
+     right now. */
+ vassert(0 == (guest_R15_curr_instr_notENC & 1));
+ llPutIReg( 15, mkU32(guest_R15_curr_instr_notENC | 1) );
+ irsb->next = mkU32(guest_R15_curr_instr_notENC | 1 /* CPSR.T */);
+ irsb->jumpkind = Ijk_NoDecode;
+ dres.whatNext = Dis_StopHere;
+ dres.len = 0;
+ return dres;
+
+ decode_success:
+ /* All decode successes end up here. */
+ DIP("\n");
+
+ vassert(dres.len == 2 || dres.len == 4 || dres.len == 20);
+
+#if 0
+ // XXX is this necessary on Thumb?
+ /* Now then. Do we have an implicit jump to r15 to deal with? */
+ if (r15written) {
+ /* If we get jump to deal with, we assume that there's been no
+ other competing branch stuff previously generated for this
+ insn. That's reasonable, in the sense that the ARM insn set
+ appears to declare as "Unpredictable" any instruction which
+ generates more than one possible new value for r15. Hence
+ just assert. The decoders themselves should check against
+ all such instructions which are thusly Unpredictable, and
+ decline to decode them. Hence we should never get here if we
+ have competing new values for r15, and hence it is safe to
+ assert here. */
+ vassert(dres.whatNext == Dis_Continue);
+ vassert(irsb->next == NULL);
+      vassert(irsb->jumpkind == Ijk_Boring);
+ /* If r15 is unconditionally written, terminate the block by
+ jumping to it. If it's conditionally written, still
+ terminate the block (a shame, but we can't do side exits to
+ arbitrary destinations), but first jump to the next
+ instruction if the condition doesn't hold. */
+ /* We can't use getIRegT(15) to get the destination, since that
+ will produce r15+4, which isn't what we want. Must use
+ llGetIReg(15) instead. */
+ if (r15guard == IRTemp_INVALID) {
+ /* unconditional */
+ } else {
+ /* conditional */
+ stmt( IRStmt_Exit(
+ unop(Iop_32to1,
+ binop(Iop_Xor32,
+ mkexpr(r15guard), mkU32(1))),
+ r15kind,
+ IRConst_U32(guest_R15_curr_instr_notENC + 4)
+ ));
+ }
+ irsb->next = llGetIReg(15);
+ irsb->jumpkind = r15kind;
+ dres.whatNext = Dis_StopHere;
+ }
+#endif
+
+ return dres;
+
+# undef INSN0
+# undef INSN1
+}
+
+#undef DIP
+#undef DIS
+
+
+/*------------------------------------------------------------*/
+/*--- Top-level fn ---*/
+/*------------------------------------------------------------*/
+
+/* Disassemble a single instruction into IR. The instruction
+ is located in host memory at &guest_code[delta]. */
+
+DisResult disInstr_ARM ( IRSB* irsb_IN,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_code_IN,
+ Long delta_ENCODED,
+ Addr64 guest_IP_ENCODED,
+ VexArch guest_arch,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo,
+ Bool host_bigendian_IN )
+{
+ DisResult dres;
+ Bool isThumb = (Bool)(guest_IP_ENCODED & 1);
+
+ /* Set globals (see top of this file) */
+ vassert(guest_arch == VexArchARM);
+
+ irsb = irsb_IN;
+ host_is_bigendian = host_bigendian_IN;
+ __curr_is_Thumb = isThumb;
+
+ if (isThumb) {
+ guest_R15_curr_instr_notENC = (Addr32)guest_IP_ENCODED - 1;
+ } else {
+ guest_R15_curr_instr_notENC = (Addr32)guest_IP_ENCODED;
+ }
+
+ if (isThumb) {
+ dres = disInstr_THUMB_WRK ( put_IP, resteerOkFn,
+ resteerCisOk, callback_opaque,
+ &guest_code_IN[delta_ENCODED - 1],
+ archinfo, abiinfo );
+ } else {
+ dres = disInstr_ARM_WRK ( put_IP, resteerOkFn,
+ resteerCisOk, callback_opaque,
+ &guest_code_IN[delta_ENCODED],
+ archinfo, abiinfo );
+ }
+
+ return dres;
+}
+
+/* Test program for the conversion of IRCmpF64Result values to VFP
+ nzcv values. See handling of FCMPD et al above. */
+/*
+UInt foo ( UInt x )
+{
+ UInt ix = ((x >> 5) & 3) | (x & 1);
+ UInt termL = (((((ix ^ 1) << 30) - 1) >> 29) + 1);
+ UInt termR = (ix & (ix >> 1) & 1);
+ return termL - termR;
+}
+
+void try ( char* s, UInt ir, UInt req )
+{
+ UInt act = foo(ir);
+ printf("%s 0x%02x -> req %d%d%d%d act %d%d%d%d (0x%x)\n",
+ s, ir, (req >> 3) & 1, (req >> 2) & 1,
+ (req >> 1) & 1, (req >> 0) & 1,
+ (act >> 3) & 1, (act >> 2) & 1,
+ (act >> 1) & 1, (act >> 0) & 1, act);
+
+}
+
+int main ( void )
+{
+ printf("\n");
+ try("UN", 0x45, 0b0011);
+ try("LT", 0x01, 0b1000);
+ try("GT", 0x00, 0b0010);
+ try("EQ", 0x40, 0b0110);
+ printf("\n");
+ return 0;
+}
+*/
+
+/*--------------------------------------------------------------------*/
+/*--- end guest_arm_toIR.c ---*/
+/*--------------------------------------------------------------------*/
diff --git a/VEX/priv/guest_generic_bb_to_IR.c b/VEX/priv/guest_generic_bb_to_IR.c
new file mode 100644
index 0000000..f7dc020
--- /dev/null
+++ b/VEX/priv/guest_generic_bb_to_IR.c
@@ -0,0 +1,822 @@
+
+/*--------------------------------------------------------------------*/
+/*--- begin guest_generic_bb_to_IR.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+#include "main_util.h"
+#include "main_globals.h"
+#include "guest_generic_bb_to_IR.h"
+
+
+/* Forwards .. */
+__attribute__((regparm(2)))
+static UInt genericg_compute_checksum_4al ( HWord first_w32, HWord n_w32s );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_1 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_2 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_3 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_4 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_5 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_6 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_7 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_8 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_9 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_10 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_11 ( HWord first_w32 );
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_12 ( HWord first_w32 );
+
+/* Small helpers */
+static Bool const_False ( void* callback_opaque, Addr64 a ) {
+ return False;
+}
+
+/* Disassemble a complete basic block, starting at guest_IP_start,
+ returning a new IRSB. The disassembler may chase across basic
+ block boundaries if it wishes and if chase_into_ok allows it.
+ The precise guest address ranges from which code has been taken
+ are written into vge. guest_IP_bbstart is taken to be the IP in
+ the guest's address space corresponding to the instruction at
+ &guest_code[0].
+
+   dis_instr_fn is the arch-specific fn to disassemble one
+   instruction; it is this that does the real work.
+
+ do_self_check indicates that the caller needs a self-checking
+ translation.
+
+ preamble_function is a callback which allows the caller to add
+ its own IR preamble (following the self-check, if any). May be
+ NULL. If non-NULL, the IRSB under construction is handed to
+ this function, which presumably adds IR statements to it. The
+ callback may optionally complete the block and direct bb_to_IR
+ not to disassemble any instructions into it; this is indicated
+ by the callback returning True.
+
+   offB_TISTART and offB_TILEN are the offsets of guest_TISTART and
+   guest_TILEN.  Since this routine has to work for any guest state,
+   without knowing what it is, those offsets have to be passed in.
+
+ callback_opaque is a caller-supplied pointer to data which the
+ callbacks may want to see. Vex has no idea what it is.
+ (In fact it's a VgInstrumentClosure.)
+*/
+
+IRSB* bb_to_IR ( /*OUT*/VexGuestExtents* vge,
+ /*IN*/ void* callback_opaque,
+ /*IN*/ DisOneInstrFn dis_instr_fn,
+ /*IN*/ UChar* guest_code,
+ /*IN*/ Addr64 guest_IP_bbstart,
+ /*IN*/ Bool (*chase_into_ok)(void*,Addr64),
+ /*IN*/ Bool host_bigendian,
+ /*IN*/ VexArch arch_guest,
+ /*IN*/ VexArchInfo* archinfo_guest,
+ /*IN*/ VexAbiInfo* abiinfo_both,
+ /*IN*/ IRType guest_word_type,
+ /*IN*/ Bool do_self_check,
+ /*IN*/ Bool (*preamble_function)(void*,IRSB*),
+ /*IN*/ Int offB_TISTART,
+ /*IN*/ Int offB_TILEN )
+{
+ Long delta;
+ Int i, n_instrs, first_stmt_idx;
+ Bool resteerOK, need_to_put_IP, debug_print;
+ DisResult dres;
+ IRStmt* imark;
+ static Int n_resteers = 0;
+ Int d_resteers = 0;
+ Int selfcheck_idx = 0;
+ IRSB* irsb;
+ Addr64 guest_IP_curr_instr;
+ IRConst* guest_IP_bbstart_IRConst = NULL;
+ Int n_cond_resteers_allowed = 2;
+
+ Bool (*resteerOKfn)(void*,Addr64) = NULL;
+
+ debug_print = toBool(vex_traceflags & VEX_TRACE_FE);
+
+ /* Note: for adler32 to work without % operation for the self
+ check, need to limit length of stuff it scans to 5552 bytes.
+ Therefore limiting the max bb len to 100 insns seems generously
+ conservative. */
+
+ /* check sanity .. */
+ vassert(sizeof(HWord) == sizeof(void*));
+ vassert(vex_control.guest_max_insns >= 1);
+ vassert(vex_control.guest_max_insns < 100);
+ vassert(vex_control.guest_chase_thresh >= 0);
+ vassert(vex_control.guest_chase_thresh < vex_control.guest_max_insns);
+ vassert(guest_word_type == Ity_I32 || guest_word_type == Ity_I64);
+
+ /* Start a new, empty extent. */
+ vge->n_used = 1;
+ vge->base[0] = guest_IP_bbstart;
+ vge->len[0] = 0;
+
+ /* And a new IR superblock to dump the result into. */
+ irsb = emptyIRSB();
+
+ /* Delta keeps track of how far along the guest_code array we have
+ so far gone. */
+ delta = 0;
+ n_instrs = 0;
+
+ /* Guest addresses as IRConsts. Used in the two self-checks
+ generated. */
+ if (do_self_check) {
+ guest_IP_bbstart_IRConst
+ = guest_word_type==Ity_I32
+ ? IRConst_U32(toUInt(guest_IP_bbstart))
+ : IRConst_U64(guest_IP_bbstart);
+ }
+
+ /* If asked to make a self-checking translation, leave 5 spaces
+ in which to put the check statements. We'll fill them in later
+ when we know the length and adler32 of the area to check. */
+ if (do_self_check) {
+ selfcheck_idx = irsb->stmts_used;
+ addStmtToIRSB( irsb, IRStmt_NoOp() );
+ addStmtToIRSB( irsb, IRStmt_NoOp() );
+ addStmtToIRSB( irsb, IRStmt_NoOp() );
+ addStmtToIRSB( irsb, IRStmt_NoOp() );
+ addStmtToIRSB( irsb, IRStmt_NoOp() );
+ }
+
+ /* If the caller supplied a function to add its own preamble, use
+ it now. */
+ if (preamble_function) {
+ Bool stopNow = preamble_function( callback_opaque, irsb );
+ if (stopNow) {
+ /* The callback has completed the IR block without any guest
+ insns being disassembled into it, so just return it at
+ this point, even if a self-check was requested - as there
+ is nothing to self-check. The five self-check no-ops will
+ still be in place, but they are harmless. */
+ return irsb;
+ }
+ }
+
+ /* Process instructions. */
+ while (True) {
+ vassert(n_instrs < vex_control.guest_max_insns);
+
+ /* Regardless of what chase_into_ok says, is chasing permissible
+ at all right now? Set resteerOKfn accordingly. */
+ resteerOK
+ = toBool(
+ n_instrs < vex_control.guest_chase_thresh
+ /* If making self-checking translations, don't chase
+ .. it makes the checks too complicated. We only want
+ to scan just one sequence of bytes in the check, not
+ a whole bunch. */
+ && !do_self_check
+ /* we can't afford to have a resteer once we're on the
+ last extent slot. */
+ && vge->n_used < 3
+ );
+
+ resteerOKfn
+ = resteerOK ? chase_into_ok : const_False;
+
+      /* n_cond_resteers_allowed keeps track of whether we're still
+         allowing dis_instr_fn to chase conditional branches.  It
+         starts at 2 and is decremented each time dis_instr_fn tells
+         us it has chased a conditional branch; whether it is still
+         positive is used to tell later calls to dis_instr_fn whether
+         they are allowed to chase conditional branches. */
+ vassert(n_cond_resteers_allowed >= 0 && n_cond_resteers_allowed <= 2);
+
+ /* This is the IP of the instruction we're just about to deal
+ with. */
+ guest_IP_curr_instr = guest_IP_bbstart + delta;
+
+ /* This is the irsb statement array index of the first stmt in
+ this insn. That will always be the instruction-mark
+ descriptor. */
+ first_stmt_idx = irsb->stmts_used;
+
+      /* Add an instruction-mark statement.  We won't know until after
+         disassembling the instruction how long it is, so just put in
+         a zero length and we'll fix it up later. */
+ addStmtToIRSB( irsb, IRStmt_IMark( guest_IP_curr_instr, 0 ));
+
+ /* for the first insn, the dispatch loop will have set
+ %IP, but for all the others we have to do it ourselves. */
+ need_to_put_IP = toBool(n_instrs > 0);
+
+ /* Finally, actually disassemble an instruction. */
+ dres = dis_instr_fn ( irsb,
+ need_to_put_IP,
+ resteerOKfn,
+ toBool(n_cond_resteers_allowed > 0),
+ callback_opaque,
+ guest_code,
+ delta,
+ guest_IP_curr_instr,
+ arch_guest,
+ archinfo_guest,
+ abiinfo_both,
+ host_bigendian );
+
+ /* stay sane ... */
+ vassert(dres.whatNext == Dis_StopHere
+ || dres.whatNext == Dis_Continue
+ || dres.whatNext == Dis_ResteerU
+ || dres.whatNext == Dis_ResteerC);
+ /* ... disassembled insn length is sane ... */
+ vassert(dres.len >= 0 && dres.len <= 20);
+ /* ... continueAt is zero if no resteer requested ... */
+ if (dres.whatNext != Dis_ResteerU && dres.whatNext != Dis_ResteerC)
+ vassert(dres.continueAt == 0);
+ /* ... if we disallowed conditional resteers, check that one
+ didn't actually happen anyway ... */
+ if (n_cond_resteers_allowed == 0)
+ vassert(dres.whatNext != Dis_ResteerC);
+
+ /* Fill in the insn-mark length field. */
+ vassert(first_stmt_idx >= 0 && first_stmt_idx < irsb->stmts_used);
+ imark = irsb->stmts[first_stmt_idx];
+ vassert(imark);
+ vassert(imark->tag == Ist_IMark);
+ vassert(imark->Ist.IMark.len == 0);
+ imark->Ist.IMark.len = toUInt(dres.len);
+
+ /* Print the resulting IR, if needed. */
+ if (vex_traceflags & VEX_TRACE_FE) {
+ for (i = first_stmt_idx; i < irsb->stmts_used; i++) {
+ vex_printf(" ");
+ ppIRStmt(irsb->stmts[i]);
+ vex_printf("\n");
+ }
+ }
+
+ /* If dis_instr_fn terminated the BB at this point, check it
+ also filled in the irsb->next field. */
+ if (dres.whatNext == Dis_StopHere) {
+ vassert(irsb->next != NULL);
+ if (debug_print) {
+ vex_printf(" ");
+ vex_printf( "goto {");
+ ppIRJumpKind(irsb->jumpkind);
+ vex_printf( "} ");
+ ppIRExpr( irsb->next );
+ vex_printf( "\n");
+ }
+ }
+
+ /* Update the VexGuestExtents we are constructing. */
+      /* Since vex_control.guest_max_insns is required to be < 100 and
+         each insn is at most 20 bytes long, the max possible extent
+         length is 100 * 20 == 2000, so this limit of 5000 seems
+         reasonable. */
+ vassert(vge->len[vge->n_used-1] < 5000);
+ vge->len[vge->n_used-1]
+ = toUShort(toUInt( vge->len[vge->n_used-1] + dres.len ));
+ n_instrs++;
+ if (debug_print)
+ vex_printf("\n");
+
+ /* Advance delta (inconspicuous but very important :-) */
+ delta += (Long)dres.len;
+
+ switch (dres.whatNext) {
+ case Dis_Continue:
+ vassert(irsb->next == NULL);
+ if (n_instrs < vex_control.guest_max_insns) {
+ /* keep going */
+ } else {
+ /* We have to stop. */
+ irsb->next
+ = IRExpr_Const(
+ guest_word_type == Ity_I32
+ ? IRConst_U32(toUInt(guest_IP_bbstart+delta))
+ : IRConst_U64(guest_IP_bbstart+delta)
+ );
+ goto done;
+ }
+ break;
+ case Dis_StopHere:
+ vassert(irsb->next != NULL);
+ goto done;
+ case Dis_ResteerU:
+ case Dis_ResteerC:
+ /* Check that we actually allowed a resteer .. */
+ vassert(resteerOK);
+ vassert(irsb->next == NULL);
+ if (dres.whatNext == Dis_ResteerC) {
+ vassert(n_cond_resteers_allowed > 0);
+ n_cond_resteers_allowed--;
+ }
+ /* figure out a new delta to continue at. */
+ vassert(resteerOKfn(callback_opaque,dres.continueAt));
+ delta = dres.continueAt - guest_IP_bbstart;
+ /* we now have to start a new extent slot. */
+ vge->n_used++;
+ vassert(vge->n_used <= 3);
+ vge->base[vge->n_used-1] = dres.continueAt;
+ vge->len[vge->n_used-1] = 0;
+ n_resteers++;
+ d_resteers++;
+ if (0 && (n_resteers & 0xFF) == 0)
+ vex_printf("resteer[%d,%d] to 0x%llx (delta = %lld)\n",
+ n_resteers, d_resteers,
+ dres.continueAt, delta);
+ break;
+ default:
+ vpanic("bb_to_IR");
+ }
+ }
+ /*NOTREACHED*/
+ vassert(0);
+
+ done:
+ /* We're done. The only thing that might need attending to is that
+ a self-checking preamble may need to be created.
+
+ The scheme is to compute a rather crude checksum of the code
+ we're making a translation of, and add to the IR a call to a
+ helper routine which recomputes the checksum every time the
+ translation is run, and requests a retranslation if it doesn't
+ match. This is obviously very expensive and considerable
+ efforts are made to speed it up:
+
+ * the checksum is computed from all the 32-bit words that
+ overlap the translated code. That means it could depend on up
+ to 3 bytes before and 3 bytes after which aren't part of the
+ translated area, and so if those change then we'll
+       unnecessarily have to discard and retranslate.  This seems
+       like a pretty remote possibility, and the benefit of not
+       having to deal with the ends of the range at byte precision
+       far outweighs the cost of any extra retranslations needed.
+
+ * there's a generic routine and 12 specialised cases, which
+ handle the cases of 1 through 12-word lengths respectively.
+ They seem to cover about 90% of the cases that occur in
+ practice.
+ */
+ if (do_self_check) {
+
+ UInt len2check, expected32;
+ IRTemp tistart_tmp, tilen_tmp;
+ UInt (*fn_generic)(HWord, HWord) __attribute__((regparm(2)));
+ UInt (*fn_spec)(HWord) __attribute__((regparm(1)));
+ HChar* nm_generic;
+ HChar* nm_spec;
+ HWord fn_generic_entry = 0;
+ HWord fn_spec_entry = 0;
+
+ vassert(vge->n_used == 1);
+ len2check = vge->len[0];
+
+ /* stay sane */
+ vassert(len2check >= 0 && len2check < 1000/*arbitrary*/);
+
+ /* Skip the check if the translation involved zero bytes */
+ if (len2check > 0) {
+ HWord first_w32 = ((HWord)guest_code) & ~(HWord)3;
+ HWord last_w32 = (((HWord)guest_code) + len2check - 1) & ~(HWord)3;
+ vassert(first_w32 <= last_w32);
+ HWord w32_diff = last_w32 - first_w32;
+ vassert(0 == (w32_diff & 3));
+ HWord w32s_to_check = (w32_diff + 4) / 4;
+         vassert(w32s_to_check > 0 && w32s_to_check < 1004/4 /*arbitrary*/);
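+         /* Worked example: guest_code == 0x1003 and len2check == 6
+            give first_w32 == 0x1000 and last_w32 == 0x1008, hence
+            w32_diff == 8 and w32s_to_check == 3: the three words
+            overlapping the bytes [0x1003 .. 0x1008]. */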
+
+ /* vex_printf("%lx %lx %ld\n", first_w32, last_w32, w32s_to_check); */
+
+ fn_generic = genericg_compute_checksum_4al;
+ nm_generic = "genericg_compute_checksum_4al";
+ fn_spec = NULL;
+ nm_spec = NULL;
+
+ switch (w32s_to_check) {
+ case 1: fn_spec = genericg_compute_checksum_4al_1;
+ nm_spec = "genericg_compute_checksum_4al_1"; break;
+ case 2: fn_spec = genericg_compute_checksum_4al_2;
+ nm_spec = "genericg_compute_checksum_4al_2"; break;
+ case 3: fn_spec = genericg_compute_checksum_4al_3;
+ nm_spec = "genericg_compute_checksum_4al_3"; break;
+ case 4: fn_spec = genericg_compute_checksum_4al_4;
+ nm_spec = "genericg_compute_checksum_4al_4"; break;
+ case 5: fn_spec = genericg_compute_checksum_4al_5;
+ nm_spec = "genericg_compute_checksum_4al_5"; break;
+ case 6: fn_spec = genericg_compute_checksum_4al_6;
+ nm_spec = "genericg_compute_checksum_4al_6"; break;
+ case 7: fn_spec = genericg_compute_checksum_4al_7;
+ nm_spec = "genericg_compute_checksum_4al_7"; break;
+ case 8: fn_spec = genericg_compute_checksum_4al_8;
+ nm_spec = "genericg_compute_checksum_4al_8"; break;
+ case 9: fn_spec = genericg_compute_checksum_4al_9;
+ nm_spec = "genericg_compute_checksum_4al_9"; break;
+ case 10: fn_spec = genericg_compute_checksum_4al_10;
+ nm_spec = "genericg_compute_checksum_4al_10"; break;
+ case 11: fn_spec = genericg_compute_checksum_4al_11;
+ nm_spec = "genericg_compute_checksum_4al_11"; break;
+ case 12: fn_spec = genericg_compute_checksum_4al_12;
+ nm_spec = "genericg_compute_checksum_4al_12"; break;
+ default: break;
+ }
+
+ expected32 = fn_generic( first_w32, w32s_to_check );
+ /* If we got a specialised version, check it produces the same
+ result as the generic version! */
+ if (fn_spec) {
+ vassert(nm_spec);
+ vassert(expected32 == fn_spec( first_w32 ));
+ } else {
+ vassert(!nm_spec);
+ }
+
+         /* Set TISTART and TILEN.  These will describe to the dispatcher
+ the area of guest code to invalidate should we exit with a
+ self-check failure. */
+
+ tistart_tmp = newIRTemp(irsb->tyenv, guest_word_type);
+ tilen_tmp = newIRTemp(irsb->tyenv, guest_word_type);
+
+ irsb->stmts[selfcheck_idx+0]
+ = IRStmt_WrTmp(tistart_tmp, IRExpr_Const(guest_IP_bbstart_IRConst) );
+
+ irsb->stmts[selfcheck_idx+1]
+ = IRStmt_WrTmp(tilen_tmp,
+ guest_word_type==Ity_I32
+ ? IRExpr_Const(IRConst_U32(len2check))
+ : IRExpr_Const(IRConst_U64(len2check))
+ );
+
+ irsb->stmts[selfcheck_idx+2]
+ = IRStmt_Put( offB_TISTART, IRExpr_RdTmp(tistart_tmp) );
+
+ irsb->stmts[selfcheck_idx+3]
+ = IRStmt_Put( offB_TILEN, IRExpr_RdTmp(tilen_tmp) );
+
+ /* Generate the entry point descriptors */
+ if (abiinfo_both->host_ppc_calls_use_fndescrs) {
+ HWord* descr = (HWord*)fn_generic;
+ fn_generic_entry = descr[0];
+ if (fn_spec) {
+ descr = (HWord*)fn_spec;
+ fn_spec_entry = descr[0];
+ } else {
+ fn_spec_entry = (HWord)NULL;
+ }
+ } else {
+ fn_generic_entry = (HWord)fn_generic;
+ if (fn_spec) {
+ fn_spec_entry = (HWord)fn_spec;
+ } else {
+ fn_spec_entry = (HWord)NULL;
+ }
+ }
+
+ IRExpr* callexpr = NULL;
+ if (fn_spec) {
+ callexpr = mkIRExprCCall(
+ Ity_I32, 1/*regparms*/,
+ nm_spec, (void*)fn_spec_entry,
+ mkIRExprVec_1(
+ mkIRExpr_HWord( (HWord)first_w32 )
+ )
+ );
+ } else {
+ callexpr = mkIRExprCCall(
+ Ity_I32, 2/*regparms*/,
+ nm_generic, (void*)fn_generic_entry,
+ mkIRExprVec_2(
+ mkIRExpr_HWord( (HWord)first_w32 ),
+ mkIRExpr_HWord( (HWord)w32s_to_check )
+ )
+ );
+ }
+
+ irsb->stmts[selfcheck_idx+4]
+ = IRStmt_Exit(
+ IRExpr_Binop(
+ Iop_CmpNE32,
+ callexpr,
+ IRExpr_Const(IRConst_U32(expected32))
+ ),
+ Ijk_TInval,
+ guest_IP_bbstart_IRConst
+ );
+ }
+ }
+
+ return irsb;
+}
+
+
+/*-------------------------------------------------------------
+ A support routine for doing self-checking translations.
+ -------------------------------------------------------------*/
+
+/* CLEAN HELPER */
+/* CALLED FROM GENERATED CODE */
+
+/* Compute a checksum of host memory starting at the 4-aligned
+   address first_w32, as fast as possible.  The _4al_N variants are
+   specialised for exactly N 32-bit words; the generic version
+   handles any word count, down to zero.  These fns are called once
+   for every use of a self-checking translation, so they need to be
+   as fast as possible. */
+
+static inline UInt ROL32 ( UInt w, Int n ) {
+ w = (w << n) | (w >> (32-n));
+ return w;
+}
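+
+/* Note: every call below uses n == 31; a rotate amount of zero would
+   be undefined behaviour (w >> 32).  Example: ROL32(0x80000001, 31)
+   == 0xC0000000. */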
+
+__attribute__((regparm(2)))
+static UInt genericg_compute_checksum_4al ( HWord first_w32, HWord n_w32s )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ /* unrolled */
+ while (n_w32s >= 4) {
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ p += 4;
+ n_w32s -= 4;
+ sum1 ^= sum2;
+ }
+ while (n_w32s >= 1) {
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ p += 1;
+ n_w32s -= 1;
+ sum1 ^= sum2;
+ }
+ return sum1 + sum2;
+}
+
+/* Specialised versions of the above function */
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_1 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_2 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_3 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_4 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_5 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[4]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_6 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[4]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[5]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_7 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[4]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[5]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[6]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_8 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[4]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[5]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[6]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[7]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_9 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[4]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[5]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[6]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[7]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[8]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_10 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[4]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[5]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[6]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[7]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[8]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[9]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_11 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[4]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[5]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[6]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[7]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[8]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[9]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[10]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+__attribute__((regparm(1)))
+static UInt genericg_compute_checksum_4al_12 ( HWord first_w32 )
+{
+ UInt sum1 = 0, sum2 = 0;
+ UInt* p = (UInt*)first_w32;
+ UInt w;
+ w = p[0]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[1]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[2]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[3]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[4]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[5]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[6]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[7]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ w = p[8]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[9]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[10]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ w = p[11]; sum1 = ROL32(sum1 ^ w, 31); sum2 += w;
+ sum1 ^= sum2;
+ return sum1 + sum2;
+}
+
+/*--------------------------------------------------------------------*/
+/*--- end guest_generic_bb_to_IR.c ---*/
+/*--------------------------------------------------------------------*/
diff --git a/VEX/priv/guest_generic_bb_to_IR.h b/VEX/priv/guest_generic_bb_to_IR.h
new file mode 100644
index 0000000..9ea10cb
--- /dev/null
+++ b/VEX/priv/guest_generic_bb_to_IR.h
@@ -0,0 +1,182 @@
+
+/*--------------------------------------------------------------------*/
+/*--- begin guest_generic_bb_to_IR.h ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#ifndef __VEX_GUEST_GENERIC_BB_TO_IR_H
+#define __VEX_GUEST_GENERIC_BB_TO_IR_H
+
+
+/* This defines stuff needed by the guest insn disassemblers.
+   It's a bit circular; it is imported by
+ - the guest-specific toIR.c files (guest-{x86,amd64,ppc,arm}/toIR.c)
+ - the generic disassembly driver (bb_to_IR.c)
+ - vex_main.c
+*/
+
+
+/* ---------------------------------------------------------------
+ Result of disassembling an instruction
+ --------------------------------------------------------------- */
+
+/* The results of disassembling an instruction.  There are four
+   possible outcomes.  For Dis_ResteerU and Dis_ResteerC, the
+   disassembler _must_ continue at the specified address.  For
+   Dis_StopHere, the disassembler _must_ terminate the BB.  For
+   Dis_Continue, we may at
+ our option either disassemble the next insn, or terminate the BB;
+ but in the latter case we must set the bb's ->next field to point
+ to the next instruction. */
+
+typedef
+
+ struct {
+
+ /* The disassembled insn has this length. Must always be
+ set. */
+ Int len;
+
+ /* What happens next?
+ Dis_StopHere: this insn terminates the BB; we must stop.
+ Dis_Continue: we can optionally continue into the next insn
+ Dis_ResteerU: followed an unconditional branch; continue at
+ 'continueAt'
+ Dis_ResteerC: (speculatively, of course) followed a
+ conditional branch; continue at 'continueAt'
+ */
+ enum { Dis_StopHere, Dis_Continue,
+ Dis_ResteerU, Dis_ResteerC } whatNext;
+
+ /* For Dis_Resteer, this is the guest address we should continue
+ at. Otherwise ignored (should be zero). */
+ Addr64 continueAt;
+
+ }
+
+ DisResult;
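+
+/* For example, a decoder which consumed a 4-byte insn and can fall
+   through to the next one would return { len = 4, whatNext =
+   Dis_Continue, continueAt = 0 }. */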
+
+
+/* ---------------------------------------------------------------
+ The type of a function which disassembles one instruction.
+   C's function-type syntax is really astonishingly bizarre.
+ --------------------------------------------------------------- */
+
+/* A function of this type (DisOneInstrFn) disassembles an instruction
+ located at host address &guest_code[delta], whose guest IP is
+ guest_IP (this may be entirely unrelated to where the insn is
+   actually located in the host's address space).  The returned
+ DisResult.len field carries its size. If the returned
+   DisResult.whatNext field is Dis_ResteerU or Dis_ResteerC then
+   DisResult.continueAt should hold the guest IP of the next insn to
+   disassemble.
+
+   disInstr is not permitted to return Dis_ResteerU or Dis_ResteerC
+   if resteerOkFn, when applied to the address which it wishes to
+   resteer into, returns False.
+
+ The resulting IR is added to the end of irbb.
+*/
+
+typedef
+
+ DisResult (*DisOneInstrFn) (
+
+ /* This is the IRSB to which the resulting IR is to be appended. */
+ /*OUT*/ IRSB* irbb,
+
+ /* Do we need to generate IR to set the guest IP for this insn,
+ or not? */
+ /*IN*/ Bool put_IP,
+
+ /* Return True iff resteering to the given addr is allowed (for
+ branches/calls to destinations that are known at JIT-time) */
+ /*IN*/ Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
+
+ /* Should we speculatively resteer across conditional branches?
+ (Experimental and not enabled by default). The strategy is
+ to assume that backward branches are taken and forward
+ branches are not taken. */
+ /*IN*/ Bool resteerCisOk,
+
+ /* Vex-opaque data passed to all caller (valgrind) supplied
+ callbacks. */
+ /*IN*/ void* callback_opaque,
+
+ /* Where is the guest code? */
+ /*IN*/ UChar* guest_code,
+
+ /* Where is the actual insn? Note: it's at &guest_code[delta] */
+ /*IN*/ Long delta,
+
+ /* What is the guest IP of the insn? */
+ /*IN*/ Addr64 guest_IP,
+
+ /* Info about the guest architecture */
+ /*IN*/ VexArch guest_arch,
+ /*IN*/ VexArchInfo* archinfo,
+
+ /* ABI info for both guest and host */
+ /*IN*/ VexAbiInfo* abiinfo,
+
+ /* Is the host bigendian? */
+ /*IN*/ Bool host_bigendian
+
+ );
+
+
+/* ---------------------------------------------------------------
+ Top-level BB to IR conversion fn.
+ --------------------------------------------------------------- */
+
+/* See detailed comment in bb_to_IR.c. */
+extern
+IRSB* bb_to_IR ( /*OUT*/VexGuestExtents* vge,
+ /*IN*/ void* closure_opaque,
+ /*IN*/ DisOneInstrFn dis_instr_fn,
+ /*IN*/ UChar* guest_code,
+ /*IN*/ Addr64 guest_IP_bbstart,
+ /*IN*/ Bool (*chase_into_ok)(void*,Addr64),
+ /*IN*/ Bool host_bigendian,
+ /*IN*/ VexArch arch_guest,
+ /*IN*/ VexArchInfo* archinfo_guest,
+ /*IN*/ VexAbiInfo* abiinfo_both,
+ /*IN*/ IRType guest_word_type,
+ /*IN*/ Bool do_self_check,
+ /*IN*/ Bool (*preamble_function)(void*,IRSB*),
+ /*IN*/ Int offB_TISTART,
+ /*IN*/ Int offB_TILEN );
+
+
+#endif /* ndef __VEX_GUEST_GENERIC_BB_TO_IR_H */
+
+/*--------------------------------------------------------------------*/
+/*--- end guest_generic_bb_to_IR.h ---*/
+/*--------------------------------------------------------------------*/
diff --git a/VEX/priv/guest_generic_x87.c b/VEX/priv/guest_generic_x87.c
new file mode 100644
index 0000000..4204893
--- /dev/null
+++ b/VEX/priv/guest_generic_x87.c
@@ -0,0 +1,888 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_generic_x87.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* This file contains functions for doing some x87-specific
+ operations. Both the amd64 and x86 front ends (guests) indirectly
+ call these functions via guest helper calls. By putting them here,
+ code duplication is avoided. Some of these functions are tricky
+ and hard to verify, so there is much to be said for only having one
+ copy thereof.
+*/
+
+#include "libvex_basictypes.h"
+
+#include "main_util.h"
+#include "guest_generic_x87.h"
+
+
+/* 80 and 64-bit floating point formats:
+
+ 80-bit:
+
+ S 0 0-------0 zero
+ S 0 0X------X denormals
+ S 1-7FFE 1X------X normals (all normals have leading 1)
+ S 7FFF 10------0 infinity
+ S 7FFF 10X-----X snan
+ S 7FFF 11X-----X qnan
+
+ S is the sign bit. For runs X----X, at least one of the Xs must be
+ nonzero. Exponent is 15 bits, fractional part is 63 bits, and
+ there is an explicitly represented leading 1, and a sign bit,
+ giving 80 in total.
+
+ 64-bit avoids the confusion of an explicitly represented leading 1
+ and so is simpler:
+
+ S 0 0------0 zero
+ S 0 X------X denormals
+ S 1-7FE any normals
+ S 7FF 0------0 infinity
+ S 7FF 0X-----X snan
+ S 7FF 1X-----X qnan
+
+ Exponent is 11 bits, fractional part is 52 bits, and there is a
+ sign bit, giving 64 in total.
+*/
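+
+/* Worked example: 1.0 in the two formats, shown little-endian with
+   the lowest-addressed byte first:
+
+   64-bit: sign 0, exp 0x3FF, frac 0
+           -> 00 00 00 00 00 00 F0 3F
+   80-bit: sign 0, exp 0x3FFF, explicit leading 1, frac 0
+           -> 00 00 00 00 00 00 00 80 FF 3F
+*/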
+
+
+static inline UInt read_bit_array ( UChar* arr, UInt n )
+{
+ UChar c = arr[n >> 3];
+ c >>= (n&7);
+ return c & 1;
+}
+
+static inline void write_bit_array ( UChar* arr, UInt n, UInt b )
+{
+ UChar c = arr[n >> 3];
+ c = toUChar( c & ~(1 << (n&7)) );
+ c = toUChar( c | ((b&1) << (n&7)) );
+ arr[n >> 3] = c;
+}
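+
+/* For example, read_bit_array(f64, 51) extracts bit 3 of f64[6],
+   the topmost bit of a double's stored mantissa. */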
+
+/* Convert an IEEE754 double (64-bit) into an x87 extended double
+   (80-bit), mimicking the hardware fairly closely.  Both numbers are
+ stored little-endian. Limitations, all of which could be fixed,
+ given some level of hassle:
+
+ * Identity of NaNs is not preserved.
+
+ See comments in the code for more details.
+*/
+void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 )
+{
+ Bool mantissaIsZero;
+ Int bexp, i, j, shift;
+ UChar sign;
+
+ sign = toUChar( (f64[7] >> 7) & 1 );
+ bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
+ bexp &= 0x7FF;
+
+ mantissaIsZero = False;
+ if (bexp == 0 || bexp == 0x7FF) {
+ /* We'll need to know whether or not the mantissa (bits 51:0) is
+ all zeroes in order to handle these cases. So figure it
+ out. */
+ mantissaIsZero
+ = toBool(
+ (f64[6] & 0x0F) == 0
+ && f64[5] == 0 && f64[4] == 0 && f64[3] == 0
+ && f64[2] == 0 && f64[1] == 0 && f64[0] == 0
+ );
+ }
+
+ /* If the exponent is zero, either we have a zero or a denormal.
+ Produce a zero. This is a hack in that it forces denormals to
+ zero. Could do better. */
+ if (bexp == 0) {
+ f80[9] = toUChar( sign << 7 );
+ f80[8] = f80[7] = f80[6] = f80[5] = f80[4]
+ = f80[3] = f80[2] = f80[1] = f80[0] = 0;
+
+ if (mantissaIsZero)
+ /* It really is zero, so that's all we can do. */
+ return;
+
+ /* There is at least one 1-bit in the mantissa. So it's a
+ potentially denormalised double -- but we can produce a
+ normalised long double. Count the leading zeroes in the
+ mantissa so as to decide how much to bump the exponent down
+ by. Note, this is SLOW. */
+ shift = 0;
+ for (i = 51; i >= 0; i--) {
+ if (read_bit_array(f64, i))
+ break;
+ shift++;
+ }
+
+ /* and copy into place as many bits as we can get our hands on. */
+ j = 63;
+ for (i = 51 - shift; i >= 0; i--) {
+ write_bit_array( f80, j,
+ read_bit_array( f64, i ) );
+ j--;
+ }
+
+ /* Set the exponent appropriately, and we're done. */
+ bexp -= shift;
+ bexp += (16383 - 1023);
+ f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
+ f80[8] = toUChar( bexp & 0xFF );
+ return;
+ }
+
+ /* If the exponent is 7FF, this is either an Infinity, a SNaN or
+ QNaN, as determined by examining bits 51:0, thus:
+ 0 ... 0 Inf
+ 0X ... X SNaN
+ 1X ... X QNaN
+ where at least one of the Xs is not zero.
+ */
+ if (bexp == 0x7FF) {
+ if (mantissaIsZero) {
+ /* Produce an appropriately signed infinity:
+ S 1--1 (15) 1 0--0 (63)
+ */
+ f80[9] = toUChar( (sign << 7) | 0x7F );
+ f80[8] = 0xFF;
+ f80[7] = 0x80;
+ f80[6] = f80[5] = f80[4] = f80[3]
+ = f80[2] = f80[1] = f80[0] = 0;
+ return;
+ }
+ /* So it's either a QNaN or SNaN. Distinguish by considering
+ bit 51. Note, this destroys all the trailing bits
+ (identity?) of the NaN. IEEE754 doesn't require preserving
+ these (it only requires that there be one QNaN value and one
+ SNaN value), but x87 does seem to have some ability to
+ preserve them. Anyway, here, the NaN's identity is
+ destroyed. Could be improved. */
+ if (f64[6] & 8) {
+ /* QNaN. Make a QNaN:
+ S 1--1 (15) 1 1--1 (63)
+ */
+ f80[9] = toUChar( (sign << 7) | 0x7F );
+ f80[8] = 0xFF;
+ f80[7] = 0xFF;
+ f80[6] = f80[5] = f80[4] = f80[3]
+ = f80[2] = f80[1] = f80[0] = 0xFF;
+ } else {
+ /* SNaN. Make a SNaN:
+ S 1--1 (15) 0 1--1 (63)
+ */
+ f80[9] = toUChar( (sign << 7) | 0x7F );
+ f80[8] = 0xFF;
+ f80[7] = 0x7F;
+ f80[6] = f80[5] = f80[4] = f80[3]
+ = f80[2] = f80[1] = f80[0] = 0xFF;
+ }
+ return;
+ }
+
+ /* It's not a zero, denormal, infinity or nan. So it must be a
+ normalised number. Rebias the exponent and build the new
+ number. */
+ bexp += (16383 - 1023);
+
+ f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
+ f80[8] = toUChar( bexp & 0xFF );
+ f80[7] = toUChar( (1 << 7) | ((f64[6] << 3) & 0x78)
+ | ((f64[5] >> 5) & 7) );
+ f80[6] = toUChar( ((f64[5] << 3) & 0xF8) | ((f64[4] >> 5) & 7) );
+ f80[5] = toUChar( ((f64[4] << 3) & 0xF8) | ((f64[3] >> 5) & 7) );
+ f80[4] = toUChar( ((f64[3] << 3) & 0xF8) | ((f64[2] >> 5) & 7) );
+ f80[3] = toUChar( ((f64[2] << 3) & 0xF8) | ((f64[1] >> 5) & 7) );
+ f80[2] = toUChar( ((f64[1] << 3) & 0xF8) | ((f64[0] >> 5) & 7) );
+ f80[1] = toUChar( ((f64[0] << 3) & 0xF8) );
+ f80[0] = toUChar( 0 );
+}
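+
+/* Usage sketch, assuming a little-endian host (so that the in-memory
+   byte image of a double really is the f64le format expected here):
+
+      double d = 1.0;
+      UChar f80[10], f64img[8];
+      convert_f64le_to_f80le( (UChar*)&d, f80 );
+      convert_f80le_to_f64le( f80, f64img );  // recovers the image of 1.0
+*/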
+
+
+/* Convert an x87 extended double (80-bit) into an IEEE 754 double
+ (64-bit), mimicking the hardware fairly closely. Both numbers are
+ stored little-endian. Limitations, both of which could be fixed,
+ given some level of hassle:
+
+ * Rounding following truncation could be a bit better.
+
+ * Identity of NaNs is not preserved.
+
+ See comments in the code for more details.
+*/
+void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 )
+{
+ Bool isInf;
+ Int bexp, i, j;
+ UChar sign;
+
+ sign = toUChar((f80[9] >> 7) & 1);
+ bexp = (((UInt)f80[9]) << 8) | (UInt)f80[8];
+ bexp &= 0x7FFF;
+
+ /* If the exponent is zero, either we have a zero or a denormal.
+ But an extended precision denormal becomes a double precision
+ zero, so in either case, just produce the appropriately signed
+ zero. */
+ if (bexp == 0) {
+ f64[7] = toUChar(sign << 7);
+ f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
+ return;
+ }
+
+ /* If the exponent is 7FFF, this is either an Infinity, a SNaN or
+ QNaN, as determined by examining bits 62:0, thus:
+ 0 ... 0 Inf
+ 0X ... X SNaN
+ 1X ... X QNaN
+ where at least one of the Xs is not zero.
+ */
+ if (bexp == 0x7FFF) {
+ isInf = toBool(
+ (f80[7] & 0x7F) == 0
+ && f80[6] == 0 && f80[5] == 0 && f80[4] == 0
+ && f80[3] == 0 && f80[2] == 0 && f80[1] == 0
+ && f80[0] == 0
+ );
+ if (isInf) {
+ if (0 == (f80[7] & 0x80))
+            goto weird_NaN;
+ /* Produce an appropriately signed infinity:
+ S 1--1 (11) 0--0 (52)
+ */
+ f64[7] = toUChar((sign << 7) | 0x7F);
+ f64[6] = 0xF0;
+ f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
+ return;
+ }
+ /* So it's either a QNaN or SNaN. Distinguish by considering
+ bit 62. Note, this destroys all the trailing bits
+ (identity?) of the NaN. IEEE754 doesn't require preserving
+ these (it only requires that there be one QNaN value and one
+ SNaN value), but x87 does seem to have some ability to
+ preserve them. Anyway, here, the NaN's identity is
+ destroyed. Could be improved. */
+      if (f80[7] & 0x40) {
+ /* QNaN. Make a QNaN:
+ S 1--1 (11) 1 1--1 (51)
+ */
+ f64[7] = toUChar((sign << 7) | 0x7F);
+ f64[6] = 0xFF;
+ f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
+ } else {
+ /* SNaN. Make a SNaN:
+ S 1--1 (11) 0 1--1 (51)
+ */
+ f64[7] = toUChar((sign << 7) | 0x7F);
+ f64[6] = 0xF7;
+ f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
+ }
+ return;
+ }
+
+ /* If it's not a Zero, NaN or Inf, and the integer part (bit 62) is
+ zero, the x87 FPU appears to consider the number denormalised
+ and converts it to a QNaN. */
+ if (0 == (f80[7] & 0x80)) {
+   weird_NaN:
+ /* Strange hardware QNaN:
+ S 1--1 (11) 1 0--0 (51)
+ */
+ /* On a PIII, these QNaNs always appear with sign==1. I have
+ no idea why. */
+ f64[7] = (1 /*sign*/ << 7) | 0x7F;
+ f64[6] = 0xF8;
+ f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
+ return;
+ }
+
+ /* It's not a zero, denormal, infinity or nan. So it must be a
+ normalised number. Rebias the exponent and consider. */
+ bexp -= (16383 - 1023);
+ if (bexp >= 0x7FF) {
+ /* It's too big for a double. Construct an infinity. */
+ f64[7] = toUChar((sign << 7) | 0x7F);
+ f64[6] = 0xF0;
+ f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
+ return;
+ }
+
+ if (bexp <= 0) {
+ /* It's too small for a normalised double. First construct a
+ zero and then see if it can be improved into a denormal. */
+ f64[7] = toUChar(sign << 7);
+ f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
+
+ if (bexp < -52)
+ /* Too small even for a denormal. */
+ return;
+
+ /* Ok, let's make a denormal. Note, this is SLOW. */
+      /* Copy bits 63, 62, 61, etc of the src mantissa into the dst
+         at indexes 51+bexp, 50+bexp, etc, stopping when the dst
+         index goes negative. */
+ /* bexp is in range -52 .. 0 inclusive */
+ for (i = 63; i >= 0; i--) {
+ j = i - 12 + bexp;
+ if (j < 0) break;
+ /* We shouldn't really call vassert from generated code. */
+ vassert(j >= 0 && j < 52);
+ write_bit_array ( f64,
+ j,
+ read_bit_array ( f80, i ) );
+ }
+ /* and now we might have to round ... */
+ if (read_bit_array(f80, 10+1 - bexp) == 1)
+ goto do_rounding;
+
+ return;
+ }
+
+ /* Ok, it's a normalised number which is representable as a double.
+ Copy the exponent and mantissa into place. */
+ /*
+ for (i = 0; i < 52; i++)
+ write_bit_array ( f64,
+ i,
+ read_bit_array ( f80, i+11 ) );
+ */
+ f64[0] = toUChar( (f80[1] >> 3) | (f80[2] << 5) );
+ f64[1] = toUChar( (f80[2] >> 3) | (f80[3] << 5) );
+ f64[2] = toUChar( (f80[3] >> 3) | (f80[4] << 5) );
+ f64[3] = toUChar( (f80[4] >> 3) | (f80[5] << 5) );
+ f64[4] = toUChar( (f80[5] >> 3) | (f80[6] << 5) );
+ f64[5] = toUChar( (f80[6] >> 3) | (f80[7] << 5) );
+
+ f64[6] = toUChar( ((bexp << 4) & 0xF0) | ((f80[7] >> 3) & 0x0F) );
+
+ f64[7] = toUChar( (sign << 7) | ((bexp >> 4) & 0x7F) );
+
+ /* Now consider any rounding that needs to happen as a result of
+ truncating the mantissa. */
+   if (f80[1] & 4) /* read_bit_array(f80, 10) == 1 */ {
+
+ /* If the bottom bits of f80 are "100 0000 0000", then the
+ infinitely precise value is deemed to be mid-way between the
+ two closest representable values. Since we're doing
+ round-to-nearest (the default mode), in that case it is the
+ bit immediately above which indicates whether we should round
+ upwards or not -- if 0, we don't. All that is encapsulated
+ in the following simple test. */
+ if ((f80[1] & 0xF) == 4/*0100b*/ && f80[0] == 0)
+ return;
+
+ do_rounding:
+ /* Round upwards. This is a kludge. Once in every 2^24
+ roundings (statistically) the bottom three bytes are all 0xFF
+ and so we don't round at all. Could be improved. */
+      if (f64[0] != 0xFF) {
+         f64[0]++;
+      }
+      else if (f64[0] == 0xFF && f64[1] != 0xFF) {
+         f64[0] = 0;
+         f64[1]++;
+      }
+      else if (f64[0] == 0xFF && f64[1] == 0xFF && f64[2] != 0xFF) {
+         f64[0] = 0;
+         f64[1] = 0;
+         f64[2]++;
+      }
+      /* else we don't round, but we should. */
+ }
+}
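+
+/* Worked example (illustrative only, not from the original source):
+   the 80-bit encoding of 1.0 -- explicit integer bit set, biased
+   exponent 16383, mantissa 0x8000000000000000 -- takes the
+   normalised-number path above and yields the IEEE754 double
+   0x3FF0000000000000, with no rounding needed. */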
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Extract the signed significand or exponent component as per
+ fxtract. Arg and result are doubles travelling under the guise of
+ ULongs. Returns significand when getExp is zero and exponent
+ otherwise. */
+ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp )
+{
+ ULong uSig, uExp;
+ /* Long sSig; */
+ Int sExp, i;
+ UInt sign, expExp;
+
+ /*
+ S 7FF 0------0 infinity
+ S 7FF 0X-----X snan
+ S 7FF 1X-----X qnan
+ */
+ const ULong posInf = 0x7FF0000000000000ULL;
+ const ULong negInf = 0xFFF0000000000000ULL;
+ const ULong nanMask = 0x7FF0000000000000ULL;
+ const ULong qNan = 0x7FF8000000000000ULL;
+ const ULong posZero = 0x0000000000000000ULL;
+ const ULong negZero = 0x8000000000000000ULL;
+ const ULong bit51 = 1ULL << 51;
+ const ULong bit52 = 1ULL << 52;
+ const ULong sigMask = bit52 - 1;
+
+ /* Mimic Core i5 behaviour for special cases. */
+ if (arg == posInf)
+      return posInf; /* significand and exponent of +inf are both +inf */
+ if (arg == negInf)
+ return getExp ? posInf : negInf;
+ if ((arg & nanMask) == nanMask)
+ return qNan | (arg & (1ULL << 63));
+ if (arg == posZero)
+ return getExp ? negInf : posZero;
+ if (arg == negZero)
+ return getExp ? negInf : negZero;
+
+ /* Split into sign, exponent and significand. */
+ sign = ((UInt)(arg >> 63)) & 1;
+
+ /* Mask off exponent & sign. uSig is in range 0 .. 2^52-1. */
+ uSig = arg & sigMask;
+
+ /* Get the exponent. */
+ sExp = ((Int)(arg >> 52)) & 0x7FF;
+
+ /* Deal with denormals: if the exponent is zero, then the
+ significand cannot possibly be zero (negZero/posZero are handled
+ above). Shift the significand left until bit 51 of it becomes
+ 1, and decrease the exponent accordingly.
+ */
+ if (sExp == 0) {
+ for (i = 0; i < 52; i++) {
+ if (uSig & bit51)
+ break;
+ uSig <<= 1;
+ sExp--;
+ }
+ uSig <<= 1;
+ } else {
+ /* Add the implied leading-1 in the significand. */
+ uSig |= bit52;
+ }
+
+ /* Roll in the sign. */
+   /* sSig = uSig; */
+   /* if (sign) sSig = -sSig; */
+
+ /* Convert sig into a double. This should be an exact conversion.
+ Then divide by 2^52, which should give a value in the range 1.0
+ to 2.0-epsilon, at least for normalised args. */
+ /* dSig = (Double)sSig; */
+ /* dSig /= 67108864.0; */ /* 2^26 */
+ /* dSig /= 67108864.0; */ /* 2^26 */
+ uSig &= sigMask;
+ uSig |= 0x3FF0000000000000ULL;
+ if (sign)
+ uSig ^= negZero;
+
+ /* Convert exp into a double. Also an exact conversion. */
+ /* dExp = (Double)(sExp - 1023); */
+ sExp -= 1023;
+ if (sExp == 0) {
+ uExp = 0;
+ } else {
+ uExp = sExp < 0 ? -sExp : sExp;
+      expExp = 0x3FF + 52;
+ /* 1 <= uExp <= 1074 */
+ /* Skip first 42 iterations of normalisation loop as we know they
+ will always happen */
+ uExp <<= 42;
+ expExp -= 42;
+ for (i = 0; i < 52-42; i++) {
+ if (uExp & bit52)
+ break;
+ uExp <<= 1;
+ expExp--;
+ }
+ uExp &= sigMask;
+ uExp |= ((ULong)expExp) << 52;
+ if (sExp < 0) uExp ^= negZero;
+ }
+
+ return getExp ? uExp : uSig;
+}
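+
+/* Worked example (illustrative only, not from the original source):
+   for arg == 0x4008000000000000 (3.0), the above returns
+   0x3FF8000000000000 (1.5) when getExp is zero and
+   0x3FF0000000000000 (1.0) otherwise, consistent with
+   3.0 == 1.5 * 2^1. */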
+
+
+
+/*---------------------------------------------------------*/
+/*--- SSE4.2 PCMP{E,I}STR{I,M} helpers ---*/
+/*---------------------------------------------------------*/
+
+/* We need the definitions for OSZACP eflags/rflags offsets.
+ #including guest_{amd64,x86}_defs.h causes chaos, so just copy the
+ required values directly. They are not going to change in the
+ foreseeable future :-)
+*/
+
+#define SHIFT_O 11
+#define SHIFT_S 7
+#define SHIFT_Z 6
+#define SHIFT_A 4
+#define SHIFT_C 0
+#define SHIFT_P 2
+
+#define MASK_O (1 << SHIFT_O)
+#define MASK_S (1 << SHIFT_S)
+#define MASK_Z (1 << SHIFT_Z)
+#define MASK_A (1 << SHIFT_A)
+#define MASK_C (1 << SHIFT_C)
+#define MASK_P (1 << SHIFT_P)
+
+
+/* Count leading zeroes, w/ 0-produces-32 semantics, a la Hacker's
+ Delight. */
+static UInt clz32 ( UInt x )
+{
+ Int y, m, n;
+ y = -(x >> 16);
+ m = (y >> 16) & 16;
+ n = 16 - m;
+ x = x >> m;
+ y = x - 0x100;
+ m = (y >> 16) & 8;
+ n = n + m;
+ x = x << m;
+ y = x - 0x1000;
+ m = (y >> 16) & 4;
+ n = n + m;
+ x = x << m;
+ y = x - 0x4000;
+ m = (y >> 16) & 2;
+ n = n + m;
+ x = x << m;
+ y = x >> 14;
+ m = y & ~(y >> 1);
+ return n + 2 - m;
+}
+
+static UInt ctz32 ( UInt x )
+{
+ return 32 - clz32((~x) & (x-1));
+}
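+
+/* Quick sanity values for the two helpers above (illustrative only):
+   clz32(0) == 32, clz32(1) == 31, clz32(0x80000000) == 0; hence
+   ctz32(0) == 32 and ctz32(8) == 3.  The ctz32 identity works
+   because (~x) & (x-1) isolates exactly the bits below the lowest
+   set bit of x. */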
+
+/* Convert a 4-bit value to a 32-bit value by cloning each bit 8
+ times. There's surely a better way to do this, but I don't know
+ what it is. */
+static UInt bits4_to_bytes4 ( UInt bits4 )
+{
+ UInt r = 0;
+ r |= (bits4 & 1) ? 0x000000FF : 0;
+ r |= (bits4 & 2) ? 0x0000FF00 : 0;
+ r |= (bits4 & 4) ? 0x00FF0000 : 0;
+ r |= (bits4 & 8) ? 0xFF000000 : 0;
+ return r;
+}
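+
+/* For example, bits4_to_bytes4(0x5) == 0x00FF00FF and
+   bits4_to_bytes4(0xA) == 0xFF00FF00. */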
+
+
+/* Given partial results from a pcmpXstrX operation (intRes1,
+ basically), generate an I- or M-format output value, also the new
+ OSZACP flags. */
+static
+void compute_PCMPxSTRx_gen_output (/*OUT*/V128* resV,
+ /*OUT*/UInt* resOSZACP,
+ UInt intRes1,
+ UInt zmaskL, UInt zmaskR,
+ UInt validL,
+ UInt pol, UInt idx,
+ Bool isxSTRM )
+{
+ vassert((pol >> 2) == 0);
+ vassert((idx >> 1) == 0);
+
+ UInt intRes2 = 0;
+ switch (pol) {
+ case 0: intRes2 = intRes1; break; // pol +
+ case 1: intRes2 = ~intRes1; break; // pol -
+ case 2: intRes2 = intRes1; break; // pol m+
+ case 3: intRes2 = intRes1 ^ validL; break; // pol m-
+ }
+ intRes2 &= 0xFFFF;
+
+ if (isxSTRM) {
+
+ // generate M-format output (a bit or byte mask in XMM0)
+ if (idx) {
+ resV->w32[0] = bits4_to_bytes4( (intRes2 >> 0) & 0xF );
+ resV->w32[1] = bits4_to_bytes4( (intRes2 >> 4) & 0xF );
+ resV->w32[2] = bits4_to_bytes4( (intRes2 >> 8) & 0xF );
+ resV->w32[3] = bits4_to_bytes4( (intRes2 >> 12) & 0xF );
+ } else {
+ resV->w32[0] = intRes2 & 0xFFFF;
+ resV->w32[1] = 0;
+ resV->w32[2] = 0;
+ resV->w32[3] = 0;
+ }
+
+ } else {
+
+ // generate I-format output (an index in ECX)
+ // generate ecx value
+ UInt newECX = 0;
+ if (idx) {
+ // index of ms-1-bit
+ newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
+ } else {
+ // index of ls-1-bit
+ newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
+ }
+
+ resV->w32[0] = newECX;
+ resV->w32[1] = 0;
+ resV->w32[2] = 0;
+ resV->w32[3] = 0;
+
+ }
+
+ // generate new flags, common to all ISTRI and ISTRM cases
+ *resOSZACP // A, P are zero
+ = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
+ | ((zmaskL == 0) ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
+ | ((zmaskR == 0) ? 0 : MASK_S) // S == 1 iff any in argR is 0
+ | ((intRes2 & 1) << SHIFT_O); // O == IntRes2[0]
+}
+
+
+/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
+ variants.
+
+ For xSTRI variants, the new ECX value is placed in the 32 bits
+ pointed to by *resV, and the top 96 bits are zeroed. For xSTRM
+ variants, the result is a 128 bit value and is placed at *resV in
+ the obvious way.
+
+ For all variants, the new OSZACP value is placed at *resOSZACP.
+
+ argLV and argRV are the vector args. The caller must prepare a
+ 16-bit mask for each, zmaskL and zmaskR. For ISTRx variants this
+   must be 1 for each zero byte of the respective arg.  For ESTRx
+ variants this is derived from the explicit length indication, and
+ must be 0 in all places except at the bit index corresponding to
+ the valid length (0 .. 16). If the valid length is 16 then the
+ mask must be all zeroes. In all cases, bits 31:16 must be zero.
+
+   imm8 is the original immediate from the instruction.  isxSTRM
+   indicates whether this is an xSTRM or xSTRI variant, which controls
+   how much of *resV is written.
+
+ If the given imm8 case can be handled, the return value is True.
+   If not, False is returned, and neither *resV nor *resOSZACP is
+   altered.
+*/
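+
+/* Worked examples for the zmask convention (illustrative only): for
+   an ISTRx variant with argL holding "ab" followed by zero bytes,
+   bytes 2..15 are zero, so zmaskL == 0xFFFC; for an ESTRx variant
+   with an explicit length of 5, zmaskL == (1 << 5) == 0x20, and with
+   length 16, zmaskL == 0. */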
+
+Bool compute_PCMPxSTRx ( /*OUT*/V128* resV,
+ /*OUT*/UInt* resOSZACP,
+ V128* argLV, V128* argRV,
+ UInt zmaskL, UInt zmaskR,
+ UInt imm8, Bool isxSTRM )
+{
+ vassert(imm8 < 0x80);
+ vassert((zmaskL >> 16) == 0);
+ vassert((zmaskR >> 16) == 0);
+
+ /* Explicitly reject any imm8 values that haven't been validated,
+ even if they would probably work. Life is too short to have
+ unvalidated cases in the code base. */
+ switch (imm8) {
+ case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12:
+ case 0x1A: case 0x3A: case 0x44: case 0x4A:
+ break;
+ default:
+ return False;
+ }
+
+ UInt fmt = (imm8 >> 0) & 3; // imm8[1:0] data format
+ UInt agg = (imm8 >> 2) & 3; // imm8[3:2] aggregation fn
+ UInt pol = (imm8 >> 4) & 3; // imm8[5:4] polarity
+ UInt idx = (imm8 >> 6) & 1; // imm8[6] 1==msb/bytemask
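+   /* For instance, imm8 == 0x0C (one of the validated cases above)
+      decodes as fmt == 0 (unsigned bytes), agg == 3 (equal ordered,
+      ie substring search), pol == 0 (+) and idx == 0 (index of least
+      significant set bit). */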
+
+ /*----------------------------------------*/
+ /*-- strcmp on byte data --*/
+ /*----------------------------------------*/
+
+ if (agg == 2/*equal each, aka strcmp*/
+ && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
+ Int i;
+ UChar* argL = (UChar*)argLV;
+ UChar* argR = (UChar*)argRV;
+ UInt boolResII = 0;
+ for (i = 15; i >= 0; i--) {
+ UChar cL = argL[i];
+ UChar cR = argR[i];
+ boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
+ }
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
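+      /* (zmask | -zmask) smears the lowest set bit of zmask up to
+         bit 31, so its complement keeps exactly the lanes below the
+         first zero byte; eg zmaskL == 0xFFFC gives validL == 0x0003. */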
+
+ // do invalidation, common to all equal-each cases
+ UInt intRes1
+ = (boolResII & validL & validR) // if both valid, use cmpres
+ | (~ (validL | validR)); // if both invalid, force 1
+ // else force 0
+ intRes1 &= 0xFFFF;
+
+ // generate I-format output
+ compute_PCMPxSTRx_gen_output(
+ resV, resOSZACP,
+ intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
+ );
+
+ return True;
+ }
+
+ /*----------------------------------------*/
+ /*-- set membership on byte data --*/
+ /*----------------------------------------*/
+
+ if (agg == 0/*equal any, aka find chars in a set*/
+ && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
+ /* argL: the string, argR: charset */
+ UInt si, ci;
+ UChar* argL = (UChar*)argLV;
+ UChar* argR = (UChar*)argRV;
+ UInt boolRes = 0;
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+
+ for (si = 0; si < 16; si++) {
+ if ((validL & (1 << si)) == 0)
+ // run off the end of the string.
+ break;
+ UInt m = 0;
+ for (ci = 0; ci < 16; ci++) {
+ if ((validR & (1 << ci)) == 0) break;
+ if (argR[ci] == argL[si]) { m = 1; break; }
+ }
+ boolRes |= (m << si);
+ }
+
+ // boolRes is "pre-invalidated"
+ UInt intRes1 = boolRes & 0xFFFF;
+
+ // generate I-format output
+ compute_PCMPxSTRx_gen_output(
+ resV, resOSZACP,
+ intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
+ );
+
+ return True;
+ }
+
+ /*----------------------------------------*/
+ /*-- substring search on byte data --*/
+ /*----------------------------------------*/
+
+ if (agg == 3/*equal ordered, aka substring search*/
+ && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
+
+ /* argL: haystack, argR: needle */
+ UInt ni, hi;
+ UChar* argL = (UChar*)argLV;
+ UChar* argR = (UChar*)argRV;
+ UInt boolRes = 0;
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+ for (hi = 0; hi < 16; hi++) {
+ if ((validL & (1 << hi)) == 0)
+ // run off the end of the haystack
+ break;
+ UInt m = 1;
+ for (ni = 0; ni < 16; ni++) {
+ if ((validR & (1 << ni)) == 0) break;
+ UInt i = ni + hi;
+ if (i >= 16) break;
+ if (argL[i] != argR[ni]) { m = 0; break; }
+ }
+ boolRes |= (m << hi);
+ }
+
+ // boolRes is "pre-invalidated"
+ UInt intRes1 = boolRes & 0xFFFF;
+
+ // generate I-format output
+ compute_PCMPxSTRx_gen_output(
+ resV, resOSZACP,
+ intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
+ );
+
+ return True;
+ }
+
+ /*----------------------------------------*/
+ /*-- ranges, unsigned byte data --*/
+ /*----------------------------------------*/
+
+ if (agg == 1/*ranges*/
+ && fmt == 0/*ub*/) {
+
+ /* argL: string, argR: range-pairs */
+ UInt ri, si;
+ UChar* argL = (UChar*)argLV;
+ UChar* argR = (UChar*)argRV;
+ UInt boolRes = 0;
+ UInt validL = ~(zmaskL | -zmaskL); // not(left(zmaskL))
+ UInt validR = ~(zmaskR | -zmaskR); // not(left(zmaskR))
+ for (si = 0; si < 16; si++) {
+ if ((validL & (1 << si)) == 0)
+ // run off the end of the string
+ break;
+ UInt m = 0;
+ for (ri = 0; ri < 16; ri += 2) {
+ if ((validR & (3 << ri)) != (3 << ri)) break;
+ if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
+ m = 1; break;
+ }
+ }
+ boolRes |= (m << si);
+ }
+
+ // boolRes is "pre-invalidated"
+ UInt intRes1 = boolRes & 0xFFFF;
+
+ // generate I-format output
+ compute_PCMPxSTRx_gen_output(
+ resV, resOSZACP,
+ intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
+ );
+
+ return True;
+ }
+
+ return False;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end guest_generic_x87.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_generic_x87.h b/VEX/priv/guest_generic_x87.h
new file mode 100644
index 0000000..9cbe23b
--- /dev/null
+++ b/VEX/priv/guest_generic_x87.h
@@ -0,0 +1,114 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_generic_x87.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* This file contains functions for doing some x87-specific
+ operations. Both the amd64 and x86 front ends (guests) indirectly
+ call these functions via guest helper calls. By putting them here,
+ code duplication is avoided. Some of these functions are tricky
+ and hard to verify, so there is much to be said for only having one
+ copy thereof.
+*/
+
+#ifndef __VEX_GUEST_GENERIC_X87_H
+#define __VEX_GUEST_GENERIC_X87_H
+
+#include "libvex_basictypes.h"
+
+
+/* Convert an IEEE754 double (64-bit) into an x87 extended double
+   (80-bit), mimicking the hardware fairly closely.  Both numbers are
+ stored little-endian. Limitations, all of which could be fixed,
+ given some level of hassle:
+
+ * Identity of NaNs is not preserved.
+
+ See comments in the code for more details.
+*/
+extern
+void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 );
+
+
+/* Convert an x87 extended double (80-bit) into an IEEE 754 double
+ (64-bit), mimicking the hardware fairly closely. Both numbers are
+ stored little-endian. Limitations, both of which could be fixed,
+ given some level of hassle:
+
+ * Rounding following truncation could be a bit better.
+
+ * Identity of NaNs is not preserved.
+
+ See comments in the code for more details.
+*/
+extern
+void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 );
+
+
+/* Layout of the real x87 state. */
+typedef
+ struct {
+ UShort env[14];
+ UChar reg[80];
+ }
+ Fpu_State;
+
+/* Offsets, in 16-bit ints, into the FPU environment (env) area. */
+#define FP_ENV_CTRL 0
+#define FP_ENV_STAT 2
+#define FP_ENV_TAG 4
+#define FP_ENV_IP 6 /* and 7 */
+#define FP_ENV_CS 8
+#define FP_ENV_OPOFF 10 /* and 11 */
+#define FP_ENV_OPSEL 12
+#define FP_REG(ii) (10*(7-(ii)))
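+
+/* Note the reversed layout implied by FP_REG: each register takes 10
+   bytes of reg[], with FP_REG(7) == 0 and FP_REG(0) == 70, so
+   physical register 0 occupies reg[70] .. reg[79]. */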
+
+
+/* Do the computations for x86/amd64 FXTRACT. Called directly from
+ generated code. CLEAN HELPER. */
+extern ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp );
+
+/* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
+ variants. See bigger comment on implementation of this function
+ for details on call/return conventions. */
+extern Bool compute_PCMPxSTRx ( /*OUT*/V128* resV,
+ /*OUT*/UInt* resOSZACP,
+ V128* argLV, V128* argRV,
+ UInt zmaskL, UInt zmaskR,
+ UInt imm8, Bool isxSTRM );
+
+#endif /* ndef __VEX_GUEST_GENERIC_X87_H */
+
+/*---------------------------------------------------------------*/
+/*--- end guest_generic_x87.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_ppc_defs.h b/VEX/priv/guest_ppc_defs.h
new file mode 100644
index 0000000..dd3c62e
--- /dev/null
+++ b/VEX/priv/guest_ppc_defs.h
@@ -0,0 +1,161 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_ppc_defs.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* Only to be used within the guest-ppc directory. */
+
+
+#ifndef __VEX_GUEST_PPC_DEFS_H
+#define __VEX_GUEST_PPC_DEFS_H
+
+
+/*---------------------------------------------------------*/
+/*--- ppc to IR conversion ---*/
+/*---------------------------------------------------------*/
+
+/* Convert one ppc insn to IR. See the type DisOneInstrFn in
+ bb_to_IR.h. */
+extern
+DisResult disInstr_PPC ( IRSB* irbb,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_code,
+ Long delta,
+ Addr64 guest_IP,
+ VexArch guest_arch,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo,
+ Bool host_bigendian );
+
+/* Used by the optimiser to specialise calls to helpers. */
+extern
+IRExpr* guest_ppc32_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts );
+
+extern
+IRExpr* guest_ppc64_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts );
+
+/* Describes to the optimiser which parts of the guest state require
+ precise memory exceptions. This is logically part of the guest
+ state description. */
+extern
+Bool guest_ppc32_state_requires_precise_mem_exns ( Int, Int );
+
+extern
+Bool guest_ppc64_state_requires_precise_mem_exns ( Int, Int );
+
+extern
+VexGuestLayout ppc32Guest_layout;
+
+extern
+VexGuestLayout ppc64Guest_layout;
+
+
+/* FP Rounding mode - different encoding to IR */
+typedef
+ enum {
+ PPCrm_NEAREST = 0,
+ PPCrm_NegINF = 1,
+ PPCrm_PosINF = 2,
+ PPCrm_ZERO = 3
+ } PPCRoundingMode;
+
+/* Floating point comparison values - different encoding to IR */
+typedef
+ enum {
+ PPCcr_LT = 0x8,
+ PPCcr_GT = 0x4,
+ PPCcr_EQ = 0x2,
+ PPCcr_UN = 0x1
+ }
+ PPCCmpF64Result;
+
+/*
+ Enumeration for xer_ca/ov calculation helper functions
+*/
+enum {
+ /* 0 */ PPCG_FLAG_OP_ADD=0, // addc[o], addic
+ /* 1 */ PPCG_FLAG_OP_ADDE, // adde[o], addme[o], addze[o]
+ /* 2 */ PPCG_FLAG_OP_DIVW, // divwo
+ /* 3 */ PPCG_FLAG_OP_DIVWU, // divwuo
+ /* 4 */ PPCG_FLAG_OP_MULLW, // mullwo
+ /* 5 */ PPCG_FLAG_OP_NEG, // nego
+ /* 6 */ PPCG_FLAG_OP_SUBF, // subfo
+ /* 7 */ PPCG_FLAG_OP_SUBFC, // subfc[o]
+ /* 8 */ PPCG_FLAG_OP_SUBFE, // subfe[o], subfme[o], subfze[o]
+ /* 9 */ PPCG_FLAG_OP_SUBFI, // subfic
+ /* 10 */ PPCG_FLAG_OP_SRAW, // sraw
+ /* 11 */ PPCG_FLAG_OP_SRAWI, // srawi
+ /* 12 */ PPCG_FLAG_OP_SRAD, // srad
+ /* 13 */ PPCG_FLAG_OP_SRADI, // sradi
+ PPCG_FLAG_OP_NUMBER
+};
+
+
+/*---------------------------------------------------------*/
+/*--- ppc guest helpers ---*/
+/*---------------------------------------------------------*/
+
+/* --- CLEAN HELPERS --- */
+
+/* none, right now */
+
+/* --- DIRTY HELPERS --- */
+
+extern ULong ppcg_dirtyhelper_MFTB ( void );
+
+extern UInt ppc32g_dirtyhelper_MFSPR_268_269 ( UInt );
+
+extern UInt ppc32g_dirtyhelper_MFSPR_287 ( void );
+
+extern void ppc32g_dirtyhelper_LVS ( VexGuestPPC32State* gst,
+ UInt vD_idx, UInt sh,
+ UInt shift_right );
+
+extern void ppc64g_dirtyhelper_LVS ( VexGuestPPC64State* gst,
+ UInt vD_idx, UInt sh,
+ UInt shift_right );
+
+#endif /* ndef __VEX_GUEST_PPC_DEFS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end guest_ppc_defs.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_ppc_helpers.c b/VEX/priv/guest_ppc_helpers.c
new file mode 100644
index 0000000..e056a65
--- /dev/null
+++ b/VEX/priv/guest_ppc_helpers.c
@@ -0,0 +1,837 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_ppc_helpers.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_emwarn.h"
+#include "libvex_guest_ppc32.h"
+#include "libvex_guest_ppc64.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+#include "main_util.h"
+#include "guest_generic_bb_to_IR.h"
+#include "guest_ppc_defs.h"
+
+
+/* This file contains helper functions for ppc32 and ppc64 guest code.
+ Calls to these functions are generated by the back end. These
+ calls are of course in the host machine code and this file will be
+ compiled to host machine code, so that all makes sense.
+
+ Only change the signatures of these helper functions very
+ carefully. If you change the signature here, you'll have to change
+ the parameters passed to it in the IR calls constructed by
+ guest-ppc/toIR.c.
+*/
+
+
+/*---------------------------------------------------------------*/
+/*--- Misc integer helpers. ---*/
+/*---------------------------------------------------------------*/
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially-transparent) */
+/* Horrible hack. On non-ppc platforms, return 1. */
+/* Reads a complete, consistent 64-bit TB value. */
+ULong ppcg_dirtyhelper_MFTB ( void )
+{
+# if defined(__powerpc__) || defined(_AIX)
+ ULong res;
+ UInt lo, hi1, hi2;
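+   /* Read the upper half, the lower half, then the upper half again;
+      if the two upper reads differ, the lower half wrapped in between
+      and the whole sequence must be retried. */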
+ while (1) {
+ __asm__ __volatile__ ("\n"
+ "\tmftbu %0\n"
+ "\tmftb %1\n"
+ "\tmftbu %2\n"
+ : "=r" (hi1), "=r" (lo), "=r" (hi2)
+ );
+ if (hi1 == hi2) break;
+ }
+ res = ((ULong)hi1) << 32;
+ res |= (ULong)lo;
+ return res;
+# else
+ return 1ULL;
+# endif
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially transparent) */
+UInt ppc32g_dirtyhelper_MFSPR_268_269 ( UInt r269 )
+{
+# if defined(__powerpc__) || defined(_AIX)
+ UInt spr;
+ if (r269) {
+ __asm__ __volatile__("mfspr %0,269" : "=b"(spr));
+ } else {
+ __asm__ __volatile__("mfspr %0,268" : "=b"(spr));
+ }
+ return spr;
+# else
+ return 0;
+# endif
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (I'm not really sure what the side effects are) */
+UInt ppc32g_dirtyhelper_MFSPR_287 ( void )
+{
+# if defined(__powerpc__) || defined(_AIX)
+ UInt spr;
+ __asm__ __volatile__("mfspr %0,287" : "=b"(spr));
+ return spr;
+# else
+ return 0;
+# endif
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state) */
+void ppc32g_dirtyhelper_LVS ( VexGuestPPC32State* gst,
+ UInt vD_off, UInt sh, UInt shift_right )
+{
+ static
+ UChar ref[32] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };
+ U128* pU128_src;
+ U128* pU128_dst;
+
+ vassert( vD_off <= sizeof(VexGuestPPC32State)-8 );
+ vassert( sh <= 15 );
+ vassert( shift_right <= 1 );
+ if (shift_right)
+ sh = 16-sh;
+ /* else shift left */
+
+ pU128_src = (U128*)&ref[sh];
+ pU128_dst = (U128*)( ((UChar*)gst) + vD_off );
+
+ (*pU128_dst)[0] = (*pU128_src)[0];
+ (*pU128_dst)[1] = (*pU128_src)[1];
+ (*pU128_dst)[2] = (*pU128_src)[2];
+ (*pU128_dst)[3] = (*pU128_src)[3];
+}
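+
+/* For example, sh == 3 with shift_right == 0 copies ref[3..18], ie
+   the byte sequence 0x03,0x04,...,0x12, into the vD slot -- the
+   permute control vector that lvsl defines for an address with low
+   nibble 3. */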
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state) */
+void ppc64g_dirtyhelper_LVS ( VexGuestPPC64State* gst,
+ UInt vD_off, UInt sh, UInt shift_right )
+{
+ static
+ UChar ref[32] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+ 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
+ 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F };
+ U128* pU128_src;
+ U128* pU128_dst;
+
+ vassert( vD_off <= sizeof(VexGuestPPC64State)-8 );
+ vassert( sh <= 15 );
+ vassert( shift_right <= 1 );
+ if (shift_right)
+ sh = 16-sh;
+ /* else shift left */
+
+ pU128_src = (U128*)&ref[sh];
+ pU128_dst = (U128*)( ((UChar*)gst) + vD_off );
+
+ (*pU128_dst)[0] = (*pU128_src)[0];
+ (*pU128_dst)[1] = (*pU128_src)[1];
+ (*pU128_dst)[2] = (*pU128_src)[2];
+ (*pU128_dst)[3] = (*pU128_src)[3];
+}
+
+
+/* Helper-function specialiser. */
+
+IRExpr* guest_ppc32_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts )
+{
+ return NULL;
+}
+
+IRExpr* guest_ppc64_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts )
+{
+ return NULL;
+}
+
+
+/*----------------------------------------------*/
+/*--- The exported fns .. ---*/
+/*----------------------------------------------*/
+
+/* VISIBLE TO LIBVEX CLIENT */
+UInt LibVEX_GuestPPC32_get_CR ( /*IN*/VexGuestPPC32State* vex_state )
+{
+# define FIELD(_n) \
+ ( ( (UInt) \
+ ( (vex_state->guest_CR##_n##_321 & (7<<1)) \
+ | (vex_state->guest_CR##_n##_0 & 1) \
+ ) \
+ ) \
+ << (4 * (7-(_n))) \
+ )
+
+ return
+ FIELD(0) | FIELD(1) | FIELD(2) | FIELD(3)
+ | FIELD(4) | FIELD(5) | FIELD(6) | FIELD(7);
+
+# undef FIELD
+}
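+
+/* Example: with guest_CR0_321 == 0x8 (the LT bit) and everything
+   else zero, this returns 0x80000000 -- CR field 0 occupies the top
+   nibble of the architected 32-bit CR. */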
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+/* Note: %CR is 32 bits even for ppc64 */
+UInt LibVEX_GuestPPC64_get_CR ( /*IN*/VexGuestPPC64State* vex_state )
+{
+# define FIELD(_n) \
+ ( ( (UInt) \
+ ( (vex_state->guest_CR##_n##_321 & (7<<1)) \
+ | (vex_state->guest_CR##_n##_0 & 1) \
+ ) \
+ ) \
+ << (4 * (7-(_n))) \
+ )
+
+ return
+ FIELD(0) | FIELD(1) | FIELD(2) | FIELD(3)
+ | FIELD(4) | FIELD(5) | FIELD(6) | FIELD(7);
+
+# undef FIELD
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+void LibVEX_GuestPPC32_put_CR ( UInt cr_native,
+ /*OUT*/VexGuestPPC32State* vex_state )
+{
+ UInt t;
+
+# define FIELD(_n) \
+ do { \
+ t = cr_native >> (4*(7-(_n))); \
+ vex_state->guest_CR##_n##_0 = toUChar(t & 1); \
+ vex_state->guest_CR##_n##_321 = toUChar(t & (7<<1)); \
+ } while (0)
+
+ FIELD(0);
+ FIELD(1);
+ FIELD(2);
+ FIELD(3);
+ FIELD(4);
+ FIELD(5);
+ FIELD(6);
+ FIELD(7);
+
+# undef FIELD
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+/* Note: %CR is 32 bits even for ppc64 */
+void LibVEX_GuestPPC64_put_CR ( UInt cr_native,
+ /*OUT*/VexGuestPPC64State* vex_state )
+{
+ UInt t;
+
+# define FIELD(_n) \
+ do { \
+ t = cr_native >> (4*(7-(_n))); \
+ vex_state->guest_CR##_n##_0 = toUChar(t & 1); \
+ vex_state->guest_CR##_n##_321 = toUChar(t & (7<<1)); \
+ } while (0)
+
+ FIELD(0);
+ FIELD(1);
+ FIELD(2);
+ FIELD(3);
+ FIELD(4);
+ FIELD(5);
+ FIELD(6);
+ FIELD(7);
+
+# undef FIELD
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+UInt LibVEX_GuestPPC32_get_XER ( /*IN*/VexGuestPPC32State* vex_state )
+{
+ UInt w = 0;
+ w |= ( ((UInt)vex_state->guest_XER_BC) & 0xFF );
+ w |= ( (((UInt)vex_state->guest_XER_SO) & 0x1) << 31 );
+ w |= ( (((UInt)vex_state->guest_XER_OV) & 0x1) << 30 );
+ w |= ( (((UInt)vex_state->guest_XER_CA) & 0x1) << 29 );
+ return w;
+}
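+
+/* The architected placement reproduced here: SO, OV and CA live in
+   bits 31, 30 and 29 respectively, and the lswx/stswx byte count in
+   the low byte.  Eg guest_XER_SO == 1 alone yields 0x80000000. */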
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+/* Note: %XER is 32 bits even for ppc64 */
+UInt LibVEX_GuestPPC64_get_XER ( /*IN*/VexGuestPPC64State* vex_state )
+{
+ UInt w = 0;
+ w |= ( ((UInt)vex_state->guest_XER_BC) & 0xFF );
+ w |= ( (((UInt)vex_state->guest_XER_SO) & 0x1) << 31 );
+ w |= ( (((UInt)vex_state->guest_XER_OV) & 0x1) << 30 );
+ w |= ( (((UInt)vex_state->guest_XER_CA) & 0x1) << 29 );
+ return w;
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+void LibVEX_GuestPPC32_put_XER ( UInt xer_native,
+ /*OUT*/VexGuestPPC32State* vex_state )
+{
+ vex_state->guest_XER_BC = toUChar(xer_native & 0xFF);
+ vex_state->guest_XER_SO = toUChar((xer_native >> 31) & 0x1);
+ vex_state->guest_XER_OV = toUChar((xer_native >> 30) & 0x1);
+ vex_state->guest_XER_CA = toUChar((xer_native >> 29) & 0x1);
+}
+
+/* VISIBLE TO LIBVEX CLIENT */
+/* Note: %XER is 32 bits even for ppc64 */
+void LibVEX_GuestPPC64_put_XER ( UInt xer_native,
+ /*OUT*/VexGuestPPC64State* vex_state )
+{
+ vex_state->guest_XER_BC = toUChar(xer_native & 0xFF);
+ vex_state->guest_XER_SO = toUChar((xer_native >> 31) & 0x1);
+ vex_state->guest_XER_OV = toUChar((xer_native >> 30) & 0x1);
+ vex_state->guest_XER_CA = toUChar((xer_native >> 29) & 0x1);
+}
+
+/* VISIBLE TO LIBVEX CLIENT */
+void LibVEX_GuestPPC32_initialise ( /*OUT*/VexGuestPPC32State* vex_state )
+{
+ Int i;
+ vex_state->guest_GPR0 = 0;
+ vex_state->guest_GPR1 = 0;
+ vex_state->guest_GPR2 = 0;
+ vex_state->guest_GPR3 = 0;
+ vex_state->guest_GPR4 = 0;
+ vex_state->guest_GPR5 = 0;
+ vex_state->guest_GPR6 = 0;
+ vex_state->guest_GPR7 = 0;
+ vex_state->guest_GPR8 = 0;
+ vex_state->guest_GPR9 = 0;
+ vex_state->guest_GPR10 = 0;
+ vex_state->guest_GPR11 = 0;
+ vex_state->guest_GPR12 = 0;
+ vex_state->guest_GPR13 = 0;
+ vex_state->guest_GPR14 = 0;
+ vex_state->guest_GPR15 = 0;
+ vex_state->guest_GPR16 = 0;
+ vex_state->guest_GPR17 = 0;
+ vex_state->guest_GPR18 = 0;
+ vex_state->guest_GPR19 = 0;
+ vex_state->guest_GPR20 = 0;
+ vex_state->guest_GPR21 = 0;
+ vex_state->guest_GPR22 = 0;
+ vex_state->guest_GPR23 = 0;
+ vex_state->guest_GPR24 = 0;
+ vex_state->guest_GPR25 = 0;
+ vex_state->guest_GPR26 = 0;
+ vex_state->guest_GPR27 = 0;
+ vex_state->guest_GPR28 = 0;
+ vex_state->guest_GPR29 = 0;
+ vex_state->guest_GPR30 = 0;
+ vex_state->guest_GPR31 = 0;
+
+ vex_state->guest_FPR0 = 0;
+ vex_state->guest_FPR1 = 0;
+ vex_state->guest_FPR2 = 0;
+ vex_state->guest_FPR3 = 0;
+ vex_state->guest_FPR4 = 0;
+ vex_state->guest_FPR5 = 0;
+ vex_state->guest_FPR6 = 0;
+ vex_state->guest_FPR7 = 0;
+ vex_state->guest_FPR8 = 0;
+ vex_state->guest_FPR9 = 0;
+ vex_state->guest_FPR10 = 0;
+ vex_state->guest_FPR11 = 0;
+ vex_state->guest_FPR12 = 0;
+ vex_state->guest_FPR13 = 0;
+ vex_state->guest_FPR14 = 0;
+ vex_state->guest_FPR15 = 0;
+ vex_state->guest_FPR16 = 0;
+ vex_state->guest_FPR17 = 0;
+ vex_state->guest_FPR18 = 0;
+ vex_state->guest_FPR19 = 0;
+ vex_state->guest_FPR20 = 0;
+ vex_state->guest_FPR21 = 0;
+ vex_state->guest_FPR22 = 0;
+ vex_state->guest_FPR23 = 0;
+ vex_state->guest_FPR24 = 0;
+ vex_state->guest_FPR25 = 0;
+ vex_state->guest_FPR26 = 0;
+ vex_state->guest_FPR27 = 0;
+ vex_state->guest_FPR28 = 0;
+ vex_state->guest_FPR29 = 0;
+ vex_state->guest_FPR30 = 0;
+ vex_state->guest_FPR31 = 0;
+
+ /* Initialise the vector state. */
+# define VECZERO(_vr) _vr[0]=_vr[1]=_vr[2]=_vr[3] = 0;
+
+ VECZERO(vex_state->guest_VR0 );
+ VECZERO(vex_state->guest_VR1 );
+ VECZERO(vex_state->guest_VR2 );
+ VECZERO(vex_state->guest_VR3 );
+ VECZERO(vex_state->guest_VR4 );
+ VECZERO(vex_state->guest_VR5 );
+ VECZERO(vex_state->guest_VR6 );
+ VECZERO(vex_state->guest_VR7 );
+ VECZERO(vex_state->guest_VR8 );
+ VECZERO(vex_state->guest_VR9 );
+ VECZERO(vex_state->guest_VR10);
+ VECZERO(vex_state->guest_VR11);
+ VECZERO(vex_state->guest_VR12);
+ VECZERO(vex_state->guest_VR13);
+ VECZERO(vex_state->guest_VR14);
+ VECZERO(vex_state->guest_VR15);
+ VECZERO(vex_state->guest_VR16);
+ VECZERO(vex_state->guest_VR17);
+ VECZERO(vex_state->guest_VR18);
+ VECZERO(vex_state->guest_VR19);
+ VECZERO(vex_state->guest_VR20);
+ VECZERO(vex_state->guest_VR21);
+ VECZERO(vex_state->guest_VR22);
+ VECZERO(vex_state->guest_VR23);
+ VECZERO(vex_state->guest_VR24);
+ VECZERO(vex_state->guest_VR25);
+ VECZERO(vex_state->guest_VR26);
+ VECZERO(vex_state->guest_VR27);
+ VECZERO(vex_state->guest_VR28);
+ VECZERO(vex_state->guest_VR29);
+ VECZERO(vex_state->guest_VR30);
+ VECZERO(vex_state->guest_VR31);
+
+# undef VECZERO
+
+ vex_state->guest_CIA = 0;
+ vex_state->guest_LR = 0;
+ vex_state->guest_CTR = 0;
+
+ vex_state->guest_XER_SO = 0;
+ vex_state->guest_XER_OV = 0;
+ vex_state->guest_XER_CA = 0;
+ vex_state->guest_XER_BC = 0;
+
+ vex_state->guest_CR0_321 = 0;
+ vex_state->guest_CR0_0 = 0;
+ vex_state->guest_CR1_321 = 0;
+ vex_state->guest_CR1_0 = 0;
+ vex_state->guest_CR2_321 = 0;
+ vex_state->guest_CR2_0 = 0;
+ vex_state->guest_CR3_321 = 0;
+ vex_state->guest_CR3_0 = 0;
+ vex_state->guest_CR4_321 = 0;
+ vex_state->guest_CR4_0 = 0;
+ vex_state->guest_CR5_321 = 0;
+ vex_state->guest_CR5_0 = 0;
+ vex_state->guest_CR6_321 = 0;
+ vex_state->guest_CR6_0 = 0;
+ vex_state->guest_CR7_321 = 0;
+ vex_state->guest_CR7_0 = 0;
+
+ vex_state->guest_FPROUND = (UInt)PPCrm_NEAREST;
+
+ vex_state->guest_VRSAVE = 0;
+
+ vex_state->guest_VSCR = 0x0; // Non-Java mode = 0
+
+ vex_state->guest_EMWARN = EmWarn_NONE;
+
+ vex_state->guest_TISTART = 0;
+ vex_state->guest_TILEN = 0;
+
+ vex_state->guest_NRADDR = 0;
+ vex_state->guest_NRADDR_GPR2 = 0;
+
+ vex_state->guest_REDIR_SP = -1;
+ for (i = 0; i < VEX_GUEST_PPC32_REDIR_STACK_SIZE; i++)
+ vex_state->guest_REDIR_STACK[i] = 0;
+
+ vex_state->guest_IP_AT_SYSCALL = 0;
+ vex_state->guest_SPRG3_RO = 0;
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+void LibVEX_GuestPPC64_initialise ( /*OUT*/VexGuestPPC64State* vex_state )
+{
+ Int i;
+ vex_state->guest_GPR0 = 0;
+ vex_state->guest_GPR1 = 0;
+ vex_state->guest_GPR2 = 0;
+ vex_state->guest_GPR3 = 0;
+ vex_state->guest_GPR4 = 0;
+ vex_state->guest_GPR5 = 0;
+ vex_state->guest_GPR6 = 0;
+ vex_state->guest_GPR7 = 0;
+ vex_state->guest_GPR8 = 0;
+ vex_state->guest_GPR9 = 0;
+ vex_state->guest_GPR10 = 0;
+ vex_state->guest_GPR11 = 0;
+ vex_state->guest_GPR12 = 0;
+ vex_state->guest_GPR13 = 0;
+ vex_state->guest_GPR14 = 0;
+ vex_state->guest_GPR15 = 0;
+ vex_state->guest_GPR16 = 0;
+ vex_state->guest_GPR17 = 0;
+ vex_state->guest_GPR18 = 0;
+ vex_state->guest_GPR19 = 0;
+ vex_state->guest_GPR20 = 0;
+ vex_state->guest_GPR21 = 0;
+ vex_state->guest_GPR22 = 0;
+ vex_state->guest_GPR23 = 0;
+ vex_state->guest_GPR24 = 0;
+ vex_state->guest_GPR25 = 0;
+ vex_state->guest_GPR26 = 0;
+ vex_state->guest_GPR27 = 0;
+ vex_state->guest_GPR28 = 0;
+ vex_state->guest_GPR29 = 0;
+ vex_state->guest_GPR30 = 0;
+ vex_state->guest_GPR31 = 0;
+
+ vex_state->guest_FPR0 = 0;
+ vex_state->guest_FPR1 = 0;
+ vex_state->guest_FPR2 = 0;
+ vex_state->guest_FPR3 = 0;
+ vex_state->guest_FPR4 = 0;
+ vex_state->guest_FPR5 = 0;
+ vex_state->guest_FPR6 = 0;
+ vex_state->guest_FPR7 = 0;
+ vex_state->guest_FPR8 = 0;
+ vex_state->guest_FPR9 = 0;
+ vex_state->guest_FPR10 = 0;
+ vex_state->guest_FPR11 = 0;
+ vex_state->guest_FPR12 = 0;
+ vex_state->guest_FPR13 = 0;
+ vex_state->guest_FPR14 = 0;
+ vex_state->guest_FPR15 = 0;
+ vex_state->guest_FPR16 = 0;
+ vex_state->guest_FPR17 = 0;
+ vex_state->guest_FPR18 = 0;
+ vex_state->guest_FPR19 = 0;
+ vex_state->guest_FPR20 = 0;
+ vex_state->guest_FPR21 = 0;
+ vex_state->guest_FPR22 = 0;
+ vex_state->guest_FPR23 = 0;
+ vex_state->guest_FPR24 = 0;
+ vex_state->guest_FPR25 = 0;
+ vex_state->guest_FPR26 = 0;
+ vex_state->guest_FPR27 = 0;
+ vex_state->guest_FPR28 = 0;
+ vex_state->guest_FPR29 = 0;
+ vex_state->guest_FPR30 = 0;
+ vex_state->guest_FPR31 = 0;
+
+ /* Initialise the vector state. */
+# define VECZERO(_vr) _vr[0]=_vr[1]=_vr[2]=_vr[3] = 0;
+
+ VECZERO(vex_state->guest_VR0 );
+ VECZERO(vex_state->guest_VR1 );
+ VECZERO(vex_state->guest_VR2 );
+ VECZERO(vex_state->guest_VR3 );
+ VECZERO(vex_state->guest_VR4 );
+ VECZERO(vex_state->guest_VR5 );
+ VECZERO(vex_state->guest_VR6 );
+ VECZERO(vex_state->guest_VR7 );
+ VECZERO(vex_state->guest_VR8 );
+ VECZERO(vex_state->guest_VR9 );
+ VECZERO(vex_state->guest_VR10);
+ VECZERO(vex_state->guest_VR11);
+ VECZERO(vex_state->guest_VR12);
+ VECZERO(vex_state->guest_VR13);
+ VECZERO(vex_state->guest_VR14);
+ VECZERO(vex_state->guest_VR15);
+ VECZERO(vex_state->guest_VR16);
+ VECZERO(vex_state->guest_VR17);
+ VECZERO(vex_state->guest_VR18);
+ VECZERO(vex_state->guest_VR19);
+ VECZERO(vex_state->guest_VR20);
+ VECZERO(vex_state->guest_VR21);
+ VECZERO(vex_state->guest_VR22);
+ VECZERO(vex_state->guest_VR23);
+ VECZERO(vex_state->guest_VR24);
+ VECZERO(vex_state->guest_VR25);
+ VECZERO(vex_state->guest_VR26);
+ VECZERO(vex_state->guest_VR27);
+ VECZERO(vex_state->guest_VR28);
+ VECZERO(vex_state->guest_VR29);
+ VECZERO(vex_state->guest_VR30);
+ VECZERO(vex_state->guest_VR31);
+
+# undef VECZERO
+
+ vex_state->guest_CIA = 0;
+ vex_state->guest_LR = 0;
+ vex_state->guest_CTR = 0;
+
+ vex_state->guest_XER_SO = 0;
+ vex_state->guest_XER_OV = 0;
+ vex_state->guest_XER_CA = 0;
+ vex_state->guest_XER_BC = 0;
+
+ vex_state->guest_CR0_321 = 0;
+ vex_state->guest_CR0_0 = 0;
+ vex_state->guest_CR1_321 = 0;
+ vex_state->guest_CR1_0 = 0;
+ vex_state->guest_CR2_321 = 0;
+ vex_state->guest_CR2_0 = 0;
+ vex_state->guest_CR3_321 = 0;
+ vex_state->guest_CR3_0 = 0;
+ vex_state->guest_CR4_321 = 0;
+ vex_state->guest_CR4_0 = 0;
+ vex_state->guest_CR5_321 = 0;
+ vex_state->guest_CR5_0 = 0;
+ vex_state->guest_CR6_321 = 0;
+ vex_state->guest_CR6_0 = 0;
+ vex_state->guest_CR7_321 = 0;
+ vex_state->guest_CR7_0 = 0;
+
+ vex_state->guest_FPROUND = (UInt)PPCrm_NEAREST;
+
+ vex_state->guest_VRSAVE = 0;
+
+ vex_state->guest_VSCR = 0x0; // Non-Java mode = 0
+
+ vex_state->guest_EMWARN = EmWarn_NONE;
+
+ vex_state->padding = 0;
+
+ vex_state->guest_TISTART = 0;
+ vex_state->guest_TILEN = 0;
+
+ vex_state->guest_NRADDR = 0;
+ vex_state->guest_NRADDR_GPR2 = 0;
+
+ vex_state->guest_REDIR_SP = -1;
+ for (i = 0; i < VEX_GUEST_PPC64_REDIR_STACK_SIZE; i++)
+ vex_state->guest_REDIR_STACK[i] = 0;
+
+ vex_state->guest_IP_AT_SYSCALL = 0;
+ vex_state->guest_SPRG3_RO = 0;
+
+ vex_state->padding2 = 0;
+}
+
+
+/*-----------------------------------------------------------*/
+/*--- Describing the ppc guest state, for the benefit ---*/
+/*--- of iropt and instrumenters. ---*/
+/*-----------------------------------------------------------*/
+
+/* Figure out if any part of the guest state contained in minoff
+ .. maxoff requires precise memory exceptions. If in doubt return
+   True (but this generates significantly slower code).
+
+ By default we enforce precise exns for guest R1 (stack pointer),
+ CIA (current insn address) and LR (link register). These are the
+ minimum needed to extract correct stack backtraces from ppc
+ code. [[NB: not sure if keeping LR up to date is actually
+ necessary.]]
+*/
+Bool guest_ppc32_state_requires_precise_mem_exns ( Int minoff,
+ Int maxoff )
+{
+ Int lr_min = offsetof(VexGuestPPC32State, guest_LR);
+ Int lr_max = lr_min + 4 - 1;
+ Int r1_min = offsetof(VexGuestPPC32State, guest_GPR1);
+ Int r1_max = r1_min + 4 - 1;
+ Int cia_min = offsetof(VexGuestPPC32State, guest_CIA);
+ Int cia_max = cia_min + 4 - 1;
+
+ if (maxoff < lr_min || minoff > lr_max) {
+ /* no overlap with LR */
+ } else {
+ return True;
+ }
+
+ if (maxoff < r1_min || minoff > r1_max) {
+ /* no overlap with R1 */
+ } else {
+ return True;
+ }
+
+ if (maxoff < cia_min || minoff > cia_max) {
+ /* no overlap with CIA */
+ } else {
+ return True;
+ }
+
+ return False;
+}
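+
+/* Example: an access covering only guest_GPR0 overlaps none of LR,
+   R1 or CIA, so the function returns False and iropt is free to
+   update that state lazily across memory operations. */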
+
+Bool guest_ppc64_state_requires_precise_mem_exns ( Int minoff,
+ Int maxoff )
+{
+ /* Given that R2 is a Big Deal in the ELF ppc64 ABI, it seems
+ prudent to be conservative with it, even though thus far there
+ is no evidence to suggest that it actually needs to be kept up
+ to date wrt possible exceptions. */
+ Int lr_min = offsetof(VexGuestPPC64State, guest_LR);
+ Int lr_max = lr_min + 8 - 1;
+ Int r1_min = offsetof(VexGuestPPC64State, guest_GPR1);
+ Int r1_max = r1_min + 8 - 1;
+ Int r2_min = offsetof(VexGuestPPC64State, guest_GPR2);
+ Int r2_max = r2_min + 8 - 1;
+ Int cia_min = offsetof(VexGuestPPC64State, guest_CIA);
+ Int cia_max = cia_min + 8 - 1;
+
+ if (maxoff < lr_min || minoff > lr_max) {
+ /* no overlap with LR */
+ } else {
+ return True;
+ }
+
+ if (maxoff < r1_min || minoff > r1_max) {
+ /* no overlap with R1 */
+ } else {
+ return True;
+ }
+
+ if (maxoff < r2_min || minoff > r2_max) {
+ /* no overlap with R2 */
+ } else {
+ return True;
+ }
+
+ if (maxoff < cia_min || minoff > cia_max) {
+ /* no overlap with CIA */
+ } else {
+ return True;
+ }
+
+ return False;
+}
+
+
+#define ALWAYSDEFD32(field) \
+ { offsetof(VexGuestPPC32State, field), \
+ (sizeof ((VexGuestPPC32State*)0)->field) }
+
+VexGuestLayout
+ ppc32Guest_layout
+ = {
+ /* Total size of the guest state, in bytes. */
+ .total_sizeB = sizeof(VexGuestPPC32State),
+
+ /* Describe the stack pointer. */
+ .offset_SP = offsetof(VexGuestPPC32State,guest_GPR1),
+ .sizeof_SP = 4,
+
+ /* Describe the frame pointer. */
+ .offset_FP = offsetof(VexGuestPPC32State,guest_GPR1),
+ .sizeof_FP = 4,
+
+ /* Describe the instruction pointer. */
+ .offset_IP = offsetof(VexGuestPPC32State,guest_CIA),
+ .sizeof_IP = 4,
+
+ /* Describe any sections to be regarded by Memcheck as
+ 'always-defined'. */
+ .n_alwaysDefd = 11,
+
+ .alwaysDefd
+ = { /* 0 */ ALWAYSDEFD32(guest_CIA),
+ /* 1 */ ALWAYSDEFD32(guest_EMWARN),
+ /* 2 */ ALWAYSDEFD32(guest_TISTART),
+ /* 3 */ ALWAYSDEFD32(guest_TILEN),
+ /* 4 */ ALWAYSDEFD32(guest_VSCR),
+ /* 5 */ ALWAYSDEFD32(guest_FPROUND),
+ /* 6 */ ALWAYSDEFD32(guest_NRADDR),
+ /* 7 */ ALWAYSDEFD32(guest_NRADDR_GPR2),
+ /* 8 */ ALWAYSDEFD32(guest_REDIR_SP),
+ /* 9 */ ALWAYSDEFD32(guest_REDIR_STACK),
+ /* 10 */ ALWAYSDEFD32(guest_IP_AT_SYSCALL)
+ }
+ };
+
+#define ALWAYSDEFD64(field) \
+ { offsetof(VexGuestPPC64State, field), \
+ (sizeof ((VexGuestPPC64State*)0)->field) }
+
+VexGuestLayout
+ ppc64Guest_layout
+ = {
+ /* Total size of the guest state, in bytes. */
+ .total_sizeB = sizeof(VexGuestPPC64State),
+
+ /* Describe the stack pointer. */
+ .offset_SP = offsetof(VexGuestPPC64State,guest_GPR1),
+ .sizeof_SP = 8,
+
+ /* Describe the frame pointer. */
+ .offset_FP = offsetof(VexGuestPPC64State,guest_GPR1),
+ .sizeof_FP = 8,
+
+ /* Describe the instruction pointer. */
+ .offset_IP = offsetof(VexGuestPPC64State,guest_CIA),
+ .sizeof_IP = 8,
+
+ /* Describe any sections to be regarded by Memcheck as
+ 'always-defined'. */
+ .n_alwaysDefd = 11,
+
+ .alwaysDefd
+ = { /* 0 */ ALWAYSDEFD64(guest_CIA),
+ /* 1 */ ALWAYSDEFD64(guest_EMWARN),
+ /* 2 */ ALWAYSDEFD64(guest_TISTART),
+ /* 3 */ ALWAYSDEFD64(guest_TILEN),
+ /* 4 */ ALWAYSDEFD64(guest_VSCR),
+ /* 5 */ ALWAYSDEFD64(guest_FPROUND),
+ /* 6 */ ALWAYSDEFD64(guest_NRADDR),
+ /* 7 */ ALWAYSDEFD64(guest_NRADDR_GPR2),
+ /* 8 */ ALWAYSDEFD64(guest_REDIR_SP),
+ /* 9 */ ALWAYSDEFD64(guest_REDIR_STACK),
+ /* 10 */ ALWAYSDEFD64(guest_IP_AT_SYSCALL)
+ }
+ };
+
+/*---------------------------------------------------------------*/
+/*--- end guest_ppc_helpers.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_ppc_toIR.c b/VEX/priv/guest_ppc_toIR.c
new file mode 100644
index 0000000..f8d220d
--- /dev/null
+++ b/VEX/priv/guest_ppc_toIR.c
@@ -0,0 +1,10224 @@
+
+/*--------------------------------------------------------------------*/
+/*--- begin guest_ppc_toIR.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* TODO 18/Nov/05:
+
+ Spot rld... cases which are simply left/right shifts and emit
+ Shl64/Shr64 accordingly.
+
+ Altivec
+ - datastream insns
+ - lvxl,stvxl: load/store with 'least recently used' hint
+ - vexptefp, vlogefp
+
+ LIMITATIONS:
+
+ Various, including:
+
+ - Some invalid forms of lswi and lswx are accepted when they should
+ not be.
+
+ - Floating Point:
+ - All exceptions disabled in FPSCR
+ - condition codes not set in FPSCR
+
+ - Altivec floating point:
+ - vmaddfp, vnmsubfp
+ Because we're using Java/IEEE mode (FPSCR[NJ]), rather than the
+ system default of Non-Java mode, we get some small errors
+ (lowest bit only).
+ This is because Non-Java mode brutally hacks denormalised results
+ to zero, whereas we keep maximum accuracy. However, using
+ Non-Java mode would give us more inaccuracy, as our intermediate
+ results would then be zeroed, too.
+
+ - AbiHints for the stack red zone are only emitted for
+ unconditional calls and returns (bl, blr). They should also be
+ emitted for conditional calls and returns, but we don't have a
+ way to express that right now. Ah well.
+*/
+
+/* "Special" instructions.
+
+ This instruction decoder can decode four special instructions
+ which mean nothing natively (are no-ops as far as regs/mem are
+ concerned) but have meaning for supporting Valgrind. A special
+ instruction is flagged by a 16-byte preamble:
+
+ 32-bit mode: 54001800 54006800 5400E800 54009800
+ (rlwinm 0,0,3,0,0; rlwinm 0,0,13,0,0;
+ rlwinm 0,0,29,0,0; rlwinm 0,0,19,0,0)
+
+ 64-bit mode: 78001800 78006800 7800E802 78009802
+ (rotldi 0,0,3; rotldi 0,0,13;
+ rotldi 0,0,61; rotldi 0,0,51)
+
+   Following that, one of the following 4 is allowed
+ (standard interpretation in parentheses):
+
+ 7C210B78 (or 1,1,1) %R3 = client_request ( %R4 )
+ 7C421378 (or 2,2,2) %R3 = guest_NRADDR
+ 7C631B78 (or 3,3,3) branch-and-link-to-noredir %R11
+ 7C842378 (or 4,4,4) %R3 = guest_NRADDR_GPR2
+
+ Any other bytes following the 16-byte preamble are illegal and
+ constitute a failure in instruction decoding. This all assumes
+ that the preamble will never occur except in specific code
+ fragments designed for Valgrind to catch.
+*/
+
+
+/* Translates PPC32/64 code to IR. */
+
+/* References
+
+#define PPC32
+ "PowerPC Microprocessor Family:
+ The Programming Environments Manual for 32-Bit Microprocessors"
+ 02/21/2000
+ http://www-3.ibm.com/chips/techlib/techlib.nsf/techdocs/852569B20050FF778525699600719DF2
+
+#define PPC64
+ "PowerPC Microprocessor Family:
+ Programming Environments Manual for 64-Bit Microprocessors"
+ 06/10/2003
+ http://www-3.ibm.com/chips/techlib/techlib.nsf/techdocs/F7E732FF811F783187256FDD004D3797
+
+#define AV
+ "PowerPC Microprocessor Family:
+ AltiVec(TM) Technology Programming Environments Manual"
+ 07/10/2003
+ http://www-3.ibm.com/chips/techlib/techlib.nsf/techdocs/FBFA164F824370F987256D6A006F424D
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+#include "libvex_guest_ppc32.h"
+#include "libvex_guest_ppc64.h"
+
+#include "main_util.h"
+#include "main_globals.h"
+#include "guest_generic_bb_to_IR.h"
+#include "guest_ppc_defs.h"
+
+
+/*------------------------------------------------------------*/
+/*--- Globals ---*/
+/*------------------------------------------------------------*/
+
+/* These are set at the start of the translation of an insn, right
+ down in disInstr_PPC, so that we don't have to pass them around
+ endlessly. They are all constant during the translation of any
+ given insn. */
+
+/* We need to know this to do sub-register accesses correctly. */
+static Bool host_is_bigendian;
+
+/* Pointer to the guest code area. */
+static UChar* guest_code;
+
+/* The guest address corresponding to guest_code[0]. */
+static Addr64 guest_CIA_bbstart;
+
+/* The guest address for the instruction currently being
+ translated. */
+static Addr64 guest_CIA_curr_instr;
+
+/* The IRSB* into which we're generating code. */
+static IRSB* irsb;
+
+/* Is our guest binary 32- or 64-bit?  Set at each call to
+ disInstr_PPC below. */
+static Bool mode64 = False;
+
+// Given a pointer to a function as obtained by "& functionname" in C,
+// produce a pointer to the actual entry point for the function. For
+// most platforms it's the identity function. Unfortunately, on
+// ppc64-linux it isn't (sigh) and ditto for ppc32-aix5 and
+// ppc64-aix5.
+static void* fnptr_to_fnentry( VexAbiInfo* vbi, void* f )
+{
+ if (vbi->host_ppc_calls_use_fndescrs) {
+ /* f is a pointer to a 3-word function descriptor, of which the
+ first word is the entry address. */
+ /* note, this is correct even with cross-jitting, since this is
+ purely a host issue, not a guest one. */
+ HWord* fdescr = (HWord*)f;
+ return (void*)(fdescr[0]);
+ } else {
+ /* Simple; "& f" points directly at the code for f. */
+ return f;
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Debugging output ---*/
+/*------------------------------------------------------------*/
+
+#define DIP(format, args...) \
+ if (vex_traceflags & VEX_TRACE_FE) \
+ vex_printf(format, ## args)
+
+#define DIS(buf, format, args...) \
+ if (vex_traceflags & VEX_TRACE_FE) \
+ vex_sprintf(buf, format, ## args)
+
+
+/*------------------------------------------------------------*/
+/*--- Offsets of various parts of the ppc32/64 guest state ---*/
+/*------------------------------------------------------------*/
+
+#define offsetofPPCGuestState(_x) \
+ (mode64 ? offsetof(VexGuestPPC64State, _x) : \
+ offsetof(VexGuestPPC32State, _x))
+
+#define OFFB_CIA offsetofPPCGuestState(guest_CIA)
+#define OFFB_IP_AT_SYSCALL offsetofPPCGuestState(guest_IP_AT_SYSCALL)
+#define OFFB_SPRG3_RO offsetofPPCGuestState(guest_SPRG3_RO)
+#define OFFB_LR offsetofPPCGuestState(guest_LR)
+#define OFFB_CTR offsetofPPCGuestState(guest_CTR)
+#define OFFB_XER_SO offsetofPPCGuestState(guest_XER_SO)
+#define OFFB_XER_OV offsetofPPCGuestState(guest_XER_OV)
+#define OFFB_XER_CA offsetofPPCGuestState(guest_XER_CA)
+#define OFFB_XER_BC offsetofPPCGuestState(guest_XER_BC)
+#define OFFB_FPROUND offsetofPPCGuestState(guest_FPROUND)
+#define OFFB_VRSAVE offsetofPPCGuestState(guest_VRSAVE)
+#define OFFB_VSCR offsetofPPCGuestState(guest_VSCR)
+#define OFFB_EMWARN offsetofPPCGuestState(guest_EMWARN)
+#define OFFB_TISTART offsetofPPCGuestState(guest_TISTART)
+#define OFFB_TILEN offsetofPPCGuestState(guest_TILEN)
+#define OFFB_NRADDR offsetofPPCGuestState(guest_NRADDR)
+#define OFFB_NRADDR_GPR2 offsetofPPCGuestState(guest_NRADDR_GPR2)
+
+
+/*------------------------------------------------------------*/
+/*--- Extract instruction fields ---*/
+/*------------------------------------------------------------*/
+
+/* Extract field from insn, given idx (zero = lsb) and field length */
+#define IFIELD( insn, idx, len ) ((insn >> idx) & ((1<<len)-1))
+
+/* Extract primary opcode, instr[31:26] */
+static UChar ifieldOPC( UInt instr ) {
+ return toUChar( IFIELD( instr, 26, 6 ) );
+}
+
+/* Extract 10-bit secondary opcode, instr[10:1] */
+static UInt ifieldOPClo10 ( UInt instr) {
+ return IFIELD( instr, 1, 10 );
+}
+
+/* Extract 9-bit secondary opcode, instr[9:1] */
+static UInt ifieldOPClo9 ( UInt instr) {
+ return IFIELD( instr, 1, 9 );
+}
+
+/* Extract 5-bit secondary opcode, instr[5:1] */
+static UInt ifieldOPClo5 ( UInt instr) {
+ return IFIELD( instr, 1, 5 );
+}
+
+/* Extract RD (destination register) field, instr[25:21] */
+static UChar ifieldRegDS( UInt instr ) {
+ return toUChar( IFIELD( instr, 21, 5 ) );
+}
+
+/* Extract RA (1st source register) field, instr[20:16] */
+static UChar ifieldRegA ( UInt instr ) {
+ return toUChar( IFIELD( instr, 16, 5 ) );
+}
+
+/* Extract RB (2nd source register) field, instr[15:11] */
+static UChar ifieldRegB ( UInt instr ) {
+ return toUChar( IFIELD( instr, 11, 5 ) );
+}
+
+/* Extract RC (3rd source register) field, instr[10:6] */
+static UChar ifieldRegC ( UInt instr ) {
+ return toUChar( IFIELD( instr, 6, 5 ) );
+}
+
+/* Extract bit 10, instr[10] */
+static UChar ifieldBIT10 ( UInt instr ) {
+ return toUChar( IFIELD( instr, 10, 1 ) );
+}
+
+/* Extract 2nd lowest bit, instr[1] */
+static UChar ifieldBIT1 ( UInt instr ) {
+ return toUChar( IFIELD( instr, 1, 1 ) );
+}
+
+/* Extract lowest bit, instr[0] */
+static UChar ifieldBIT0 ( UInt instr ) {
+ return toUChar( instr & 0x1 );
+}
+
+/* Extract unsigned bottom half, instr[15:0] */
+static UInt ifieldUIMM16 ( UInt instr ) {
+ return instr & 0xFFFF;
+}
+
+/* Extract unsigned bottom 26 bits, instr[25:0] */
+static UInt ifieldUIMM26 ( UInt instr ) {
+ return instr & 0x3FFFFFF;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Guest-state identifiers ---*/
+/*------------------------------------------------------------*/
+
+typedef enum {
+ PPC_GST_CIA, // Current Instruction Address
+ PPC_GST_LR, // Link Register
+ PPC_GST_CTR, // Count Register
+ PPC_GST_XER, // Overflow, carry flags, byte count
+ PPC_GST_CR, // Condition Register
+ PPC_GST_FPSCR, // Floating Point Status/Control Register
+ PPC_GST_VRSAVE, // Vector Save/Restore Register
+ PPC_GST_VSCR, // Vector Status and Control Register
+ PPC_GST_EMWARN, // Emulation warnings
+      PPC_GST_TISTART, // For icbi: start of area to invalidate
+ PPC_GST_TILEN, // For icbi: length of area to invalidate
+ PPC_GST_IP_AT_SYSCALL, // the CIA of the most recently executed SC insn
+ PPC_GST_SPRG3_RO, // SPRG3
+ PPC_GST_MAX
+} PPC_GST;
+
+#define MASK_FPSCR_RN 0x3
+#define MASK_FPSCR_FPRF 0x1F000
+#define MASK_VSCR_VALID 0x00010001
+
+
+/*------------------------------------------------------------*/
+/*--- FP Helpers ---*/
+/*------------------------------------------------------------*/
+
+/* Produce the 32-bit pattern corresponding to the supplied
+ float. */
+static UInt float_to_bits ( Float f )
+{
+ union { UInt i; Float f; } u;
+ vassert(4 == sizeof(UInt));
+ vassert(4 == sizeof(Float));
+ vassert(4 == sizeof(u));
+ u.f = f;
+ return u.i;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Misc Helpers ---*/
+/*------------------------------------------------------------*/
+
+/* Generate mask with 1's from 'begin' through 'end',
+ wrapping if begin > end.
+ begin->end works from right to left, 0=lsb
+*/
+static UInt MASK32( UInt begin, UInt end )
+{
+ UInt m1, m2, mask;
+ vassert(begin < 32);
+ vassert(end < 32);
+ m1 = ((UInt)(-1)) << begin;
+ m2 = ((UInt)(-1)) << end << 1;
+ mask = m1 ^ m2;
+ if (begin > end) mask = ~mask; // wrap mask
+ return mask;
+}
+
+/* ditto for 64bit mask */
+static ULong MASK64( UInt begin, UInt end )
+{
+ ULong m1, m2, mask;
+ vassert(begin < 64);
+ vassert(end < 64);
+ m1 = ((ULong)(-1)) << begin;
+ m2 = ((ULong)(-1)) << end << 1;
+ mask = m1 ^ m2;
+ if (begin > end) mask = ~mask; // wrap mask
+ return mask;
+}
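+/* Worked examples (illustrative): MASK32(0,15) == 0x0000FFFF;
+   MASK32(16,31) == 0xFFFF0000; and the wrapping case MASK32(28,3)
+   == 0xF000000F, i.e. bits 31..28 and 3..0 set. */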
+
+static Addr64 nextInsnAddr( void )
+{
+ return guest_CIA_curr_instr + 4;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Helper bits and pieces for deconstructing the ---*/
+/*--- ppc32/64 insn stream. ---*/
+/*------------------------------------------------------------*/
+
+/* Add a statement to the list held by "irsb". */
+static void stmt ( IRStmt* st )
+{
+ addStmtToIRSB( irsb, st );
+}
+
+/* Generate a new temporary of the given type. */
+static IRTemp newTemp ( IRType ty )
+{
+ vassert(isPlausibleIRType(ty));
+ return newIRTemp( irsb->tyenv, ty );
+}
+
+/* Various simple conversions */
+
+static UChar extend_s_5to8 ( UChar x )
+{
+ return toUChar((((Int)x) << 27) >> 27);
+}
+
+static UInt extend_s_8to32( UChar x )
+{
+ return (UInt)((((Int)x) << 24) >> 24);
+}
+
+static UInt extend_s_16to32 ( UInt x )
+{
+ return (UInt)((((Int)x) << 16) >> 16);
+}
+
+static ULong extend_s_16to64 ( UInt x )
+{
+ return (ULong)((((Long)x) << 48) >> 48);
+}
+
+static ULong extend_s_26to64 ( UInt x )
+{
+ return (ULong)((((Long)x) << 38) >> 38);
+}
+
+static ULong extend_s_32to64 ( UInt x )
+{
+ return (ULong)((((Long)x) << 32) >> 32);
+}
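+/* For instance (illustrative), extend_s_16to32(0x8000) yields
+   0xFFFF8000 while extend_s_16to32(0x7FFF) yields 0x00007FFF; the
+   other variants behave analogously at their respective widths. */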
+
+/* Do a big-endian load of a 32-bit word, regardless of the endianness
+ of the underlying host. */
+static UInt getUIntBigendianly ( UChar* p )
+{
+ UInt w = 0;
+ w = (w << 8) | p[0];
+ w = (w << 8) | p[1];
+ w = (w << 8) | p[2];
+ w = (w << 8) | p[3];
+ return w;
+}
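+/* E.g. (illustrative) for the byte sequence { 0xDE, 0xAD, 0xBE, 0xEF }
+   this returns 0xDEADBEEF on both big- and little-endian hosts. */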
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for constructing IR. ---*/
+/*------------------------------------------------------------*/
+
+static void assign ( IRTemp dst, IRExpr* e )
+{
+ stmt( IRStmt_WrTmp(dst, e) );
+}
+
+/* This generates a normal (non store-conditional) store. */
+static void storeBE ( IRExpr* addr, IRExpr* data )
+{
+ IRType tyA = typeOfIRExpr(irsb->tyenv, addr);
+ vassert(tyA == Ity_I32 || tyA == Ity_I64);
+ stmt( IRStmt_Store(Iend_BE, addr, data) );
+}
+
+static IRExpr* unop ( IROp op, IRExpr* a )
+{
+ return IRExpr_Unop(op, a);
+}
+
+static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
+{
+ return IRExpr_Binop(op, a1, a2);
+}
+
+static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
+{
+ return IRExpr_Triop(op, a1, a2, a3);
+}
+
+static IRExpr* qop ( IROp op, IRExpr* a1, IRExpr* a2,
+ IRExpr* a3, IRExpr* a4 )
+{
+ return IRExpr_Qop(op, a1, a2, a3, a4);
+}
+
+static IRExpr* mkexpr ( IRTemp tmp )
+{
+ return IRExpr_RdTmp(tmp);
+}
+
+static IRExpr* mkU8 ( UChar i )
+{
+ return IRExpr_Const(IRConst_U8(i));
+}
+
+static IRExpr* mkU16 ( UInt i )
+{
+ return IRExpr_Const(IRConst_U16(i));
+}
+
+static IRExpr* mkU32 ( UInt i )
+{
+ return IRExpr_Const(IRConst_U32(i));
+}
+
+static IRExpr* mkU64 ( ULong i )
+{
+ return IRExpr_Const(IRConst_U64(i));
+}
+
+/* This generates a normal (non load-linked) load. */
+static IRExpr* loadBE ( IRType ty, IRExpr* addr )
+{
+ return IRExpr_Load(Iend_BE, ty, addr);
+}
+
+static IRExpr* mkOR1 ( IRExpr* arg1, IRExpr* arg2 )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, arg1) == Ity_I1);
+ vassert(typeOfIRExpr(irsb->tyenv, arg2) == Ity_I1);
+ return unop(Iop_32to1, binop(Iop_Or32, unop(Iop_1Uto32, arg1),
+ unop(Iop_1Uto32, arg2)));
+}
+
+static IRExpr* mkAND1 ( IRExpr* arg1, IRExpr* arg2 )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, arg1) == Ity_I1);
+ vassert(typeOfIRExpr(irsb->tyenv, arg2) == Ity_I1);
+ return unop(Iop_32to1, binop(Iop_And32, unop(Iop_1Uto32, arg1),
+ unop(Iop_1Uto32, arg2)));
+}
+
+/* expand V128_8Ux16 to 2x V128_16Ux8's */
+static void expand8Ux16( IRExpr* vIn,
+ /*OUTs*/ IRTemp* vEvn, IRTemp* vOdd )
+{
+ IRTemp ones8x16 = newTemp(Ity_V128);
+
+ vassert(typeOfIRExpr(irsb->tyenv, vIn) == Ity_V128);
+ vassert(vEvn && *vEvn == IRTemp_INVALID);
+ vassert(vOdd && *vOdd == IRTemp_INVALID);
+ *vEvn = newTemp(Ity_V128);
+ *vOdd = newTemp(Ity_V128);
+
+ assign( ones8x16, unop(Iop_Dup8x16, mkU8(0x1)) );
+ assign( *vOdd, binop(Iop_MullEven8Ux16, mkexpr(ones8x16), vIn) );
+ assign( *vEvn, binop(Iop_MullEven8Ux16, mkexpr(ones8x16),
+ binop(Iop_ShrV128, vIn, mkU8(8))) );
+}
+
+/* expand V128_8Sx16 to 2x V128_16Sx8's */
+static void expand8Sx16( IRExpr* vIn,
+ /*OUTs*/ IRTemp* vEvn, IRTemp* vOdd )
+{
+ IRTemp ones8x16 = newTemp(Ity_V128);
+
+ vassert(typeOfIRExpr(irsb->tyenv, vIn) == Ity_V128);
+ vassert(vEvn && *vEvn == IRTemp_INVALID);
+ vassert(vOdd && *vOdd == IRTemp_INVALID);
+ *vEvn = newTemp(Ity_V128);
+ *vOdd = newTemp(Ity_V128);
+
+ assign( ones8x16, unop(Iop_Dup8x16, mkU8(0x1)) );
+ assign( *vOdd, binop(Iop_MullEven8Sx16, mkexpr(ones8x16), vIn) );
+ assign( *vEvn, binop(Iop_MullEven8Sx16, mkexpr(ones8x16),
+ binop(Iop_ShrV128, vIn, mkU8(8))) );
+}
+
+/* expand V128_16Ux8 to 2x V128_32Ux4's */
+static void expand16Ux8( IRExpr* vIn,
+ /*OUTs*/ IRTemp* vEvn, IRTemp* vOdd )
+{
+ IRTemp ones16x8 = newTemp(Ity_V128);
+
+ vassert(typeOfIRExpr(irsb->tyenv, vIn) == Ity_V128);
+ vassert(vEvn && *vEvn == IRTemp_INVALID);
+ vassert(vOdd && *vOdd == IRTemp_INVALID);
+ *vEvn = newTemp(Ity_V128);
+ *vOdd = newTemp(Ity_V128);
+
+ assign( ones16x8, unop(Iop_Dup16x8, mkU16(0x1)) );
+ assign( *vOdd, binop(Iop_MullEven16Ux8, mkexpr(ones16x8), vIn) );
+ assign( *vEvn, binop(Iop_MullEven16Ux8, mkexpr(ones16x8),
+ binop(Iop_ShrV128, vIn, mkU8(16))) );
+}
+
+/* expand V128_16Sx8 to 2x V128_32Sx4's */
+static void expand16Sx8( IRExpr* vIn,
+ /*OUTs*/ IRTemp* vEvn, IRTemp* vOdd )
+{
+ IRTemp ones16x8 = newTemp(Ity_V128);
+
+ vassert(typeOfIRExpr(irsb->tyenv, vIn) == Ity_V128);
+ vassert(vEvn && *vEvn == IRTemp_INVALID);
+ vassert(vOdd && *vOdd == IRTemp_INVALID);
+ *vEvn = newTemp(Ity_V128);
+ *vOdd = newTemp(Ity_V128);
+
+ assign( ones16x8, unop(Iop_Dup16x8, mkU16(0x1)) );
+ assign( *vOdd, binop(Iop_MullEven16Sx8, mkexpr(ones16x8), vIn) );
+ assign( *vEvn, binop(Iop_MullEven16Sx8, mkexpr(ones16x8),
+ binop(Iop_ShrV128, vIn, mkU8(16))) );
+}
+
+/* break V128 to 4xI32's, then sign-extend to I64's */
+static void breakV128to4x64S( IRExpr* t128,
+ /*OUTs*/
+ IRTemp* t3, IRTemp* t2,
+ IRTemp* t1, IRTemp* t0 )
+{
+ IRTemp hi64 = newTemp(Ity_I64);
+ IRTemp lo64 = newTemp(Ity_I64);
+
+ vassert(typeOfIRExpr(irsb->tyenv, t128) == Ity_V128);
+ vassert(t0 && *t0 == IRTemp_INVALID);
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+ *t0 = newTemp(Ity_I64);
+ *t1 = newTemp(Ity_I64);
+ *t2 = newTemp(Ity_I64);
+ *t3 = newTemp(Ity_I64);
+
+ assign( hi64, unop(Iop_V128HIto64, t128) );
+ assign( lo64, unop(Iop_V128to64, t128) );
+ assign( *t3, unop(Iop_32Sto64, unop(Iop_64HIto32, mkexpr(hi64))) );
+ assign( *t2, unop(Iop_32Sto64, unop(Iop_64to32, mkexpr(hi64))) );
+ assign( *t1, unop(Iop_32Sto64, unop(Iop_64HIto32, mkexpr(lo64))) );
+ assign( *t0, unop(Iop_32Sto64, unop(Iop_64to32, mkexpr(lo64))) );
+}
+
+/* break V128 to 4xI32's, then zero-extend to I64's */
+static void breakV128to4x64U ( IRExpr* t128,
+ /*OUTs*/
+ IRTemp* t3, IRTemp* t2,
+ IRTemp* t1, IRTemp* t0 )
+{
+ IRTemp hi64 = newTemp(Ity_I64);
+ IRTemp lo64 = newTemp(Ity_I64);
+
+ vassert(typeOfIRExpr(irsb->tyenv, t128) == Ity_V128);
+ vassert(t0 && *t0 == IRTemp_INVALID);
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+ *t0 = newTemp(Ity_I64);
+ *t1 = newTemp(Ity_I64);
+ *t2 = newTemp(Ity_I64);
+ *t3 = newTemp(Ity_I64);
+
+ assign( hi64, unop(Iop_V128HIto64, t128) );
+ assign( lo64, unop(Iop_V128to64, t128) );
+ assign( *t3, unop(Iop_32Uto64, unop(Iop_64HIto32, mkexpr(hi64))) );
+ assign( *t2, unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(hi64))) );
+ assign( *t1, unop(Iop_32Uto64, unop(Iop_64HIto32, mkexpr(lo64))) );
+ assign( *t0, unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(lo64))) );
+}
+
+/* Signed saturating narrow 64S to 32 */
+static IRExpr* mkQNarrow64Sto32 ( IRExpr* t64 )
+{
+ IRTemp hi32 = newTemp(Ity_I32);
+ IRTemp lo32 = newTemp(Ity_I32);
+
+ vassert(typeOfIRExpr(irsb->tyenv, t64) == Ity_I64);
+
+ assign( hi32, unop(Iop_64HIto32, t64));
+ assign( lo32, unop(Iop_64to32, t64));
+
+ return IRExpr_Mux0X(
+ /* if (hi32 == (lo32 >>s 31)) */
+ unop(Iop_1Uto8,
+ binop(Iop_CmpEQ32, mkexpr(hi32),
+ binop( Iop_Sar32, mkexpr(lo32), mkU8(31)))),
+ /* else: sign dep saturate: 1->0x80000000, 0->0x7FFFFFFF */
+ binop(Iop_Add32, mkU32(0x7FFFFFFF),
+ binop(Iop_Shr32, mkexpr(hi32), mkU8(31))),
+ /* then: within signed-32 range: lo half good enough */
+ mkexpr(lo32) );
+}
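+/* Illustrative values: 0x0000000000000005 narrows to 0x00000005;
+   0x0000000100000000 (too big) saturates to 0x7FFFFFFF; and
+   0xFFFFFFFE00000000 (too negative) saturates to 0x80000000, since
+   0x7FFFFFFF + (hi32 >>u 31) == 0x7FFFFFFF + 1. */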
+
+/* Unsigned saturating narrow 64U to 32 */
+static IRExpr* mkQNarrow64Uto32 ( IRExpr* t64 )
+{
+ IRTemp hi32 = newTemp(Ity_I32);
+ IRTemp lo32 = newTemp(Ity_I32);
+
+ vassert(typeOfIRExpr(irsb->tyenv, t64) == Ity_I64);
+
+ assign( hi32, unop(Iop_64HIto32, t64));
+ assign( lo32, unop(Iop_64to32, t64));
+
+ return IRExpr_Mux0X(
+ /* if (top 32 bits of t64 are 0) */
+ unop(Iop_1Uto8, binop(Iop_CmpEQ32, mkexpr(hi32), mkU32(0))),
+ /* else: positive saturate -> 0xFFFFFFFF */
+ mkU32(0xFFFFFFFF),
+ /* then: within unsigned-32 range: lo half good enough */
+ mkexpr(lo32) );
+}
+
+/* Signed saturate narrow 64->32, combining to V128 */
+static IRExpr* mkV128from4x64S ( IRExpr* t3, IRExpr* t2,
+ IRExpr* t1, IRExpr* t0 )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, t3) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv, t2) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv, t1) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv, t0) == Ity_I64);
+ return binop(Iop_64HLtoV128,
+ binop(Iop_32HLto64,
+ mkQNarrow64Sto32( t3 ),
+ mkQNarrow64Sto32( t2 )),
+ binop(Iop_32HLto64,
+ mkQNarrow64Sto32( t1 ),
+ mkQNarrow64Sto32( t0 )));
+}
+
+/* Unsigned saturate narrow 64->32, combining to V128 */
+static IRExpr* mkV128from4x64U ( IRExpr* t3, IRExpr* t2,
+ IRExpr* t1, IRExpr* t0 )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, t3) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv, t2) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv, t1) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv, t0) == Ity_I64);
+ return binop(Iop_64HLtoV128,
+ binop(Iop_32HLto64,
+ mkQNarrow64Uto32( t3 ),
+ mkQNarrow64Uto32( t2 )),
+ binop(Iop_32HLto64,
+ mkQNarrow64Uto32( t1 ),
+ mkQNarrow64Uto32( t0 )));
+}
+
+/* Simulate irops Iop_MullOdd*, since we don't have them */
+#define MK_Iop_MullOdd8Ux16( expr_vA, expr_vB ) \
+ binop(Iop_MullEven8Ux16, \
+ binop(Iop_ShrV128, expr_vA, mkU8(8)), \
+ binop(Iop_ShrV128, expr_vB, mkU8(8)))
+
+#define MK_Iop_MullOdd8Sx16( expr_vA, expr_vB ) \
+ binop(Iop_MullEven8Sx16, \
+ binop(Iop_ShrV128, expr_vA, mkU8(8)), \
+ binop(Iop_ShrV128, expr_vB, mkU8(8)))
+
+#define MK_Iop_MullOdd16Ux8( expr_vA, expr_vB ) \
+ binop(Iop_MullEven16Ux8, \
+ binop(Iop_ShrV128, expr_vA, mkU8(16)), \
+ binop(Iop_ShrV128, expr_vB, mkU8(16)))
+
+#define MK_Iop_MullOdd16Sx8( expr_vA, expr_vB ) \
+ binop(Iop_MullEven16Sx8, \
+ binop(Iop_ShrV128, expr_vA, mkU8(16)), \
+ binop(Iop_ShrV128, expr_vB, mkU8(16)))
+
+static IRExpr* /* :: Ity_I64 */ mk64lo32Sto64 ( IRExpr* src )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, src) == Ity_I64);
+ return unop(Iop_32Sto64, unop(Iop_64to32, src));
+}
+
+static IRExpr* /* :: Ity_I64 */ mk64lo32Uto64 ( IRExpr* src )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, src) == Ity_I64);
+ return unop(Iop_32Uto64, unop(Iop_64to32, src));
+}
+
+static IROp mkSzOp ( IRType ty, IROp op8 )
+{
+ Int adj;
+ vassert(ty == Ity_I8 || ty == Ity_I16 ||
+ ty == Ity_I32 || ty == Ity_I64);
+ vassert(op8 == Iop_Add8 || op8 == Iop_Sub8 || op8 == Iop_Mul8 ||
+ op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8 ||
+ op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8 ||
+ op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8 ||
+ op8 == Iop_Not8 );
+ adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : (ty==Ity_I32 ? 2 : 3));
+ return adj + op8;
+}
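+/* This relies on the IROp enumeration placing the 8/16/32/64-bit
+   variants of each listed op consecutively; e.g.
+   mkSzOp(Ity_I32, Iop_Add8) == Iop_Add8 + 2 == Iop_Add32. */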
+
+/* Make sure we get valid 32- and 64-bit addresses */
+static Addr64 mkSzAddr ( IRType ty, Addr64 addr )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ return ( ty == Ity_I64 ?
+ (Addr64)addr :
+ (Addr64)extend_s_32to64( toUInt(addr) ) );
+}
+
+/* sz, ULong -> IRExpr */
+static IRExpr* mkSzImm ( IRType ty, ULong imm64 )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ return ty == Ity_I64 ? mkU64(imm64) : mkU32((UInt)imm64);
+}
+
+/* sz, ULong -> IRConst */
+static IRConst* mkSzConst ( IRType ty, ULong imm64 )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ return ( ty == Ity_I64 ?
+ IRConst_U64(imm64) :
+ IRConst_U32((UInt)imm64) );
+}
+
+/* Sign extend imm16 -> IRExpr* */
+static IRExpr* mkSzExtendS16 ( IRType ty, UInt imm16 )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ return ( ty == Ity_I64 ?
+ mkU64(extend_s_16to64(imm16)) :
+ mkU32(extend_s_16to32(imm16)) );
+}
+
+/* Sign extend imm32 -> IRExpr* */
+static IRExpr* mkSzExtendS32 ( IRType ty, UInt imm32 )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ return ( ty == Ity_I64 ?
+ mkU64(extend_s_32to64(imm32)) :
+ mkU32(imm32) );
+}
+
+/* IR narrows I32/I64 -> I8/I16/I32 */
+static IRExpr* mkNarrowTo8 ( IRType ty, IRExpr* src )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ return ty == Ity_I64 ? unop(Iop_64to8, src) : unop(Iop_32to8, src);
+}
+
+static IRExpr* mkNarrowTo16 ( IRType ty, IRExpr* src )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ return ty == Ity_I64 ? unop(Iop_64to16, src) : unop(Iop_32to16, src);
+}
+
+static IRExpr* mkNarrowTo32 ( IRType ty, IRExpr* src )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ return ty == Ity_I64 ? unop(Iop_64to32, src) : src;
+}
+
+/* Signed/Unsigned IR widens I8/I16/I32 -> I32/I64 */
+static IRExpr* mkWidenFrom8 ( IRType ty, IRExpr* src, Bool sined )
+{
+ IROp op;
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ if (sined) op = (ty==Ity_I32) ? Iop_8Sto32 : Iop_8Sto64;
+ else op = (ty==Ity_I32) ? Iop_8Uto32 : Iop_8Uto64;
+ return unop(op, src);
+}
+
+static IRExpr* mkWidenFrom16 ( IRType ty, IRExpr* src, Bool sined )
+{
+ IROp op;
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ if (sined) op = (ty==Ity_I32) ? Iop_16Sto32 : Iop_16Sto64;
+ else op = (ty==Ity_I32) ? Iop_16Uto32 : Iop_16Uto64;
+ return unop(op, src);
+}
+
+static IRExpr* mkWidenFrom32 ( IRType ty, IRExpr* src, Bool sined )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I64);
+ if (ty == Ity_I32)
+ return src;
+ return (sined) ? unop(Iop_32Sto64, src) : unop(Iop_32Uto64, src);
+}
+
+
+static Int integerGuestRegOffset ( UInt archreg )
+{
+ vassert(archreg < 32);
+
+ // jrs: probably not necessary; only matters if we reference sub-parts
+ // of the ppc registers, but that isn't the case
+ // later: this might affect Altivec though?
+ vassert(host_is_bigendian);
+
+ switch (archreg) {
+ case 0: return offsetofPPCGuestState(guest_GPR0);
+ case 1: return offsetofPPCGuestState(guest_GPR1);
+ case 2: return offsetofPPCGuestState(guest_GPR2);
+ case 3: return offsetofPPCGuestState(guest_GPR3);
+ case 4: return offsetofPPCGuestState(guest_GPR4);
+ case 5: return offsetofPPCGuestState(guest_GPR5);
+ case 6: return offsetofPPCGuestState(guest_GPR6);
+ case 7: return offsetofPPCGuestState(guest_GPR7);
+ case 8: return offsetofPPCGuestState(guest_GPR8);
+ case 9: return offsetofPPCGuestState(guest_GPR9);
+ case 10: return offsetofPPCGuestState(guest_GPR10);
+ case 11: return offsetofPPCGuestState(guest_GPR11);
+ case 12: return offsetofPPCGuestState(guest_GPR12);
+ case 13: return offsetofPPCGuestState(guest_GPR13);
+ case 14: return offsetofPPCGuestState(guest_GPR14);
+ case 15: return offsetofPPCGuestState(guest_GPR15);
+ case 16: return offsetofPPCGuestState(guest_GPR16);
+ case 17: return offsetofPPCGuestState(guest_GPR17);
+ case 18: return offsetofPPCGuestState(guest_GPR18);
+ case 19: return offsetofPPCGuestState(guest_GPR19);
+ case 20: return offsetofPPCGuestState(guest_GPR20);
+ case 21: return offsetofPPCGuestState(guest_GPR21);
+ case 22: return offsetofPPCGuestState(guest_GPR22);
+ case 23: return offsetofPPCGuestState(guest_GPR23);
+ case 24: return offsetofPPCGuestState(guest_GPR24);
+ case 25: return offsetofPPCGuestState(guest_GPR25);
+ case 26: return offsetofPPCGuestState(guest_GPR26);
+ case 27: return offsetofPPCGuestState(guest_GPR27);
+ case 28: return offsetofPPCGuestState(guest_GPR28);
+ case 29: return offsetofPPCGuestState(guest_GPR29);
+ case 30: return offsetofPPCGuestState(guest_GPR30);
+ case 31: return offsetofPPCGuestState(guest_GPR31);
+ default: break;
+ }
+ vpanic("integerGuestRegOffset(ppc,be)"); /*notreached*/
+}
+
+static IRExpr* getIReg ( UInt archreg )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ vassert(archreg < 32);
+ return IRExpr_Get( integerGuestRegOffset(archreg), ty );
+}
+
+/* Ditto, but write to a reg instead. */
+static void putIReg ( UInt archreg, IRExpr* e )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ vassert(archreg < 32);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == ty );
+ stmt( IRStmt_Put(integerGuestRegOffset(archreg), e) );
+}
+
+
+static Int floatGuestRegOffset ( UInt archreg )
+{
+ vassert(archreg < 32);
+
+ switch (archreg) {
+ case 0: return offsetofPPCGuestState(guest_FPR0);
+ case 1: return offsetofPPCGuestState(guest_FPR1);
+ case 2: return offsetofPPCGuestState(guest_FPR2);
+ case 3: return offsetofPPCGuestState(guest_FPR3);
+ case 4: return offsetofPPCGuestState(guest_FPR4);
+ case 5: return offsetofPPCGuestState(guest_FPR5);
+ case 6: return offsetofPPCGuestState(guest_FPR6);
+ case 7: return offsetofPPCGuestState(guest_FPR7);
+ case 8: return offsetofPPCGuestState(guest_FPR8);
+ case 9: return offsetofPPCGuestState(guest_FPR9);
+ case 10: return offsetofPPCGuestState(guest_FPR10);
+ case 11: return offsetofPPCGuestState(guest_FPR11);
+ case 12: return offsetofPPCGuestState(guest_FPR12);
+ case 13: return offsetofPPCGuestState(guest_FPR13);
+ case 14: return offsetofPPCGuestState(guest_FPR14);
+ case 15: return offsetofPPCGuestState(guest_FPR15);
+ case 16: return offsetofPPCGuestState(guest_FPR16);
+ case 17: return offsetofPPCGuestState(guest_FPR17);
+ case 18: return offsetofPPCGuestState(guest_FPR18);
+ case 19: return offsetofPPCGuestState(guest_FPR19);
+ case 20: return offsetofPPCGuestState(guest_FPR20);
+ case 21: return offsetofPPCGuestState(guest_FPR21);
+ case 22: return offsetofPPCGuestState(guest_FPR22);
+ case 23: return offsetofPPCGuestState(guest_FPR23);
+ case 24: return offsetofPPCGuestState(guest_FPR24);
+ case 25: return offsetofPPCGuestState(guest_FPR25);
+ case 26: return offsetofPPCGuestState(guest_FPR26);
+ case 27: return offsetofPPCGuestState(guest_FPR27);
+ case 28: return offsetofPPCGuestState(guest_FPR28);
+ case 29: return offsetofPPCGuestState(guest_FPR29);
+ case 30: return offsetofPPCGuestState(guest_FPR30);
+ case 31: return offsetofPPCGuestState(guest_FPR31);
+ default: break;
+ }
+ vpanic("floatGuestRegOffset(ppc)"); /*notreached*/
+}
+
+static IRExpr* getFReg ( UInt archreg )
+{
+ vassert(archreg < 32);
+ return IRExpr_Get( floatGuestRegOffset(archreg), Ity_F64 );
+}
+
+/* Ditto, but write to a reg instead. */
+static void putFReg ( UInt archreg, IRExpr* e )
+{
+ vassert(archreg < 32);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_F64);
+ stmt( IRStmt_Put(floatGuestRegOffset(archreg), e) );
+}
+
+
+static Int vectorGuestRegOffset ( UInt archreg )
+{
+ vassert(archreg < 32);
+
+ switch (archreg) {
+ case 0: return offsetofPPCGuestState(guest_VR0);
+ case 1: return offsetofPPCGuestState(guest_VR1);
+ case 2: return offsetofPPCGuestState(guest_VR2);
+ case 3: return offsetofPPCGuestState(guest_VR3);
+ case 4: return offsetofPPCGuestState(guest_VR4);
+ case 5: return offsetofPPCGuestState(guest_VR5);
+ case 6: return offsetofPPCGuestState(guest_VR6);
+ case 7: return offsetofPPCGuestState(guest_VR7);
+ case 8: return offsetofPPCGuestState(guest_VR8);
+ case 9: return offsetofPPCGuestState(guest_VR9);
+ case 10: return offsetofPPCGuestState(guest_VR10);
+ case 11: return offsetofPPCGuestState(guest_VR11);
+ case 12: return offsetofPPCGuestState(guest_VR12);
+ case 13: return offsetofPPCGuestState(guest_VR13);
+ case 14: return offsetofPPCGuestState(guest_VR14);
+ case 15: return offsetofPPCGuestState(guest_VR15);
+ case 16: return offsetofPPCGuestState(guest_VR16);
+ case 17: return offsetofPPCGuestState(guest_VR17);
+ case 18: return offsetofPPCGuestState(guest_VR18);
+ case 19: return offsetofPPCGuestState(guest_VR19);
+ case 20: return offsetofPPCGuestState(guest_VR20);
+ case 21: return offsetofPPCGuestState(guest_VR21);
+ case 22: return offsetofPPCGuestState(guest_VR22);
+ case 23: return offsetofPPCGuestState(guest_VR23);
+ case 24: return offsetofPPCGuestState(guest_VR24);
+ case 25: return offsetofPPCGuestState(guest_VR25);
+ case 26: return offsetofPPCGuestState(guest_VR26);
+ case 27: return offsetofPPCGuestState(guest_VR27);
+ case 28: return offsetofPPCGuestState(guest_VR28);
+ case 29: return offsetofPPCGuestState(guest_VR29);
+ case 30: return offsetofPPCGuestState(guest_VR30);
+ case 31: return offsetofPPCGuestState(guest_VR31);
+ default: break;
+ }
+ vpanic("vextorGuestRegOffset(ppc)"); /*notreached*/
+}
+
+static IRExpr* getVReg ( UInt archreg )
+{
+ vassert(archreg < 32);
+ return IRExpr_Get( vectorGuestRegOffset(archreg), Ity_V128 );
+}
+
+/* Ditto, but write to a reg instead. */
+static void putVReg ( UInt archreg, IRExpr* e )
+{
+ vassert(archreg < 32);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
+ stmt( IRStmt_Put(vectorGuestRegOffset(archreg), e) );
+}
+
+static Int guestCR321offset ( UInt cr )
+{
+ switch (cr) {
+ case 0: return offsetofPPCGuestState(guest_CR0_321 );
+ case 1: return offsetofPPCGuestState(guest_CR1_321 );
+ case 2: return offsetofPPCGuestState(guest_CR2_321 );
+ case 3: return offsetofPPCGuestState(guest_CR3_321 );
+ case 4: return offsetofPPCGuestState(guest_CR4_321 );
+ case 5: return offsetofPPCGuestState(guest_CR5_321 );
+ case 6: return offsetofPPCGuestState(guest_CR6_321 );
+ case 7: return offsetofPPCGuestState(guest_CR7_321 );
+ default: vpanic("guestCR321offset(ppc)");
+ }
+}
+
+static Int guestCR0offset ( UInt cr )
+{
+ switch (cr) {
+ case 0: return offsetofPPCGuestState(guest_CR0_0 );
+ case 1: return offsetofPPCGuestState(guest_CR1_0 );
+ case 2: return offsetofPPCGuestState(guest_CR2_0 );
+ case 3: return offsetofPPCGuestState(guest_CR3_0 );
+ case 4: return offsetofPPCGuestState(guest_CR4_0 );
+ case 5: return offsetofPPCGuestState(guest_CR5_0 );
+ case 6: return offsetofPPCGuestState(guest_CR6_0 );
+ case 7: return offsetofPPCGuestState(guest_CR7_0 );
+ default: vpanic("guestCR3offset(ppc)");
+ }
+}
+
+// ROTL(src32/64, rot_amt5/6)
+static IRExpr* /* :: Ity_I32/64 */ ROTL ( IRExpr* src,
+ IRExpr* rot_amt )
+{
+ IRExpr *mask, *rot;
+ vassert(typeOfIRExpr(irsb->tyenv,rot_amt) == Ity_I8);
+
+ if (typeOfIRExpr(irsb->tyenv,src) == Ity_I64) {
+ // rot = (src << rot_amt) | (src >> (64-rot_amt))
+ mask = binop(Iop_And8, rot_amt, mkU8(63));
+ rot = binop(Iop_Or64,
+ binop(Iop_Shl64, src, mask),
+ binop(Iop_Shr64, src, binop(Iop_Sub8, mkU8(64), mask)));
+ } else {
+ // rot = (src << rot_amt) | (src >> (32-rot_amt))
+ mask = binop(Iop_And8, rot_amt, mkU8(31));
+ rot = binop(Iop_Or32,
+ binop(Iop_Shl32, src, mask),
+ binop(Iop_Shr32, src, binop(Iop_Sub8, mkU8(32), mask)));
+ }
+   /* Note: the Mux0X is not merely an optimisation; it's needed
+ because otherwise the Shr is a shift by the word size when
+ mask denotes zero. For rotates by immediates, a lot of
+ this junk gets folded out. */
+ return IRExpr_Mux0X( mask, /* zero rotate */ src,
+ /* non-zero rotate */ rot );
+}
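+/* E.g. (illustrative) a 32-bit ROTL of 0x80000001 by 1 yields
+   0x00000003: the top bit re-enters at the bottom via the Shr term,
+   and the Mux0X passes the source through unchanged for a rotate
+   amount of zero. */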
+
+/* Standard effective address calc: (rA + rB) */
+static IRExpr* ea_rA_idxd ( UInt rA, UInt rB )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ vassert(rA < 32);
+ vassert(rB < 32);
+ return binop(mkSzOp(ty, Iop_Add8), getIReg(rA), getIReg(rB));
+}
+
+/* Standard effective address calc: (rA + simm) */
+static IRExpr* ea_rA_simm ( UInt rA, UInt simm16 )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ vassert(rA < 32);
+ return binop(mkSzOp(ty, Iop_Add8), getIReg(rA),
+ mkSzExtendS16(ty, simm16));
+}
+
+/* Standard effective address calc: (rA|0) */
+static IRExpr* ea_rAor0 ( UInt rA )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ vassert(rA < 32);
+ if (rA == 0) {
+ return mkSzImm(ty, 0);
+ } else {
+ return getIReg(rA);
+ }
+}
+
+/* Standard effective address calc: (rA|0) + rB */
+static IRExpr* ea_rAor0_idxd ( UInt rA, UInt rB )
+{
+ vassert(rA < 32);
+ vassert(rB < 32);
+ return (rA == 0) ? getIReg(rB) : ea_rA_idxd( rA, rB );
+}
+
+/* Standard effective address calc: (rA|0) + simm16 */
+static IRExpr* ea_rAor0_simm ( UInt rA, UInt simm16 )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ vassert(rA < 32);
+ if (rA == 0) {
+ return mkSzExtendS16(ty, simm16);
+ } else {
+ return ea_rA_simm( rA, simm16 );
+ }
+}
+
+
+/* Align effective address */
+static IRExpr* addr_align( IRExpr* addr, UChar align )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ Long mask;
+ switch (align) {
+ case 1: return addr; // byte aligned
+ case 2: mask = ((Long)-1) << 1; break; // half-word aligned
+ case 4: mask = ((Long)-1) << 2; break; // word aligned
+ case 16: mask = ((Long)-1) << 4; break; // quad-word aligned
+ default:
+ vex_printf("addr_align: align = %u\n", align);
+ vpanic("addr_align(ppc)");
+ }
+
+ vassert(typeOfIRExpr(irsb->tyenv,addr) == ty);
+ return binop( mkSzOp(ty, Iop_And8), addr, mkSzImm(ty, mask) );
+}
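+/* E.g. (illustrative) addr_align(addr, 16) clears the low 4 address
+   bits, so an address of 0x1003 becomes 0x1000. */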
+
+
+/* Exit the trace if ADDR (intended to be a guest memory address) is
+ not ALIGN-aligned, generating a request for a SIGBUS followed by a
+ restart of the current insn. */
+static void gen_SIGBUS_if_misaligned ( IRTemp addr, UChar align )
+{
+ vassert(align == 4 || align == 8);
+ if (mode64) {
+ vassert(typeOfIRTemp(irsb->tyenv, addr) == Ity_I64);
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE64,
+ binop(Iop_And64, mkexpr(addr), mkU64(align-1)),
+ mkU64(0)),
+ Ijk_SigBUS,
+ IRConst_U64( guest_CIA_curr_instr )
+ )
+ );
+ } else {
+ vassert(typeOfIRTemp(irsb->tyenv, addr) == Ity_I32);
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32,
+ binop(Iop_And32, mkexpr(addr), mkU32(align-1)),
+ mkU32(0)),
+ Ijk_SigBUS,
+ IRConst_U32( guest_CIA_curr_instr )
+ )
+ );
+ }
+}
+
+
+/* Generate AbiHints which mark points at which the ELF or PowerOpen
+ ABIs say that the stack red zone (viz, -N(r1) .. -1(r1), for some
+ N) becomes undefined. That is at function calls and returns. ELF
+ ppc32 doesn't have this "feature" (how fortunate for it). nia is
+ the address of the next instruction to be executed.
+*/
+static void make_redzone_AbiHint ( VexAbiInfo* vbi,
+ IRTemp nia, HChar* who )
+{
+ Int szB = vbi->guest_stack_redzone_size;
+ if (0) vex_printf("AbiHint: %s\n", who);
+ vassert(szB >= 0);
+ if (szB > 0) {
+ if (mode64) {
+ vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
+ stmt( IRStmt_AbiHint(
+ binop(Iop_Sub64, getIReg(1), mkU64(szB)),
+ szB,
+ mkexpr(nia)
+ ));
+ } else {
+ vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I32);
+ stmt( IRStmt_AbiHint(
+ binop(Iop_Sub32, getIReg(1), mkU32(szB)),
+ szB,
+ mkexpr(nia)
+ ));
+ }
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for condition codes. ---*/
+/*------------------------------------------------------------*/
+
+/* Condition register layout.
+
+ In the hardware, CR is laid out like this. The leftmost end is the
+ most significant bit in the register; however the IBM documentation
+ numbers the bits backwards for some reason.
+
+ CR0 CR1 .......... CR6 CR7
+ 0 .. 3 ....................... 28 .. 31 (IBM bit numbering)
+ 31 28 3 0 (normal bit numbering)
+
+ Each CR field is 4 bits: [<,>,==,SO]
+
+   Hence in IBM's notation, BI=0 is CR0[<], BI=1 is CR0[>], BI=3 is
+   CR0[SO], and BI=31 is CR7[SO].
+
+ Indexing from BI to guest state:
+
+ let n = BI / 4
+ off = BI % 4
+ this references CR n:
+
+ off==0 -> guest_CRn_321 >> 3
+ off==1 -> guest_CRn_321 >> 2
+ off==2 -> guest_CRn_321 >> 1
+ off==3 -> guest_CRn_SO
+
+ Bear in mind the only significant bit in guest_CRn_SO is bit 0
+ (normal notation) and in guest_CRn_321 the significant bits are
+ 3, 2 and 1 (normal notation).
+*/
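+/* Worked example (illustrative): BI=6 gives n=1, off=2, i.e. the EQ
+   bit of CR1, which getCRbit below fetches as
+   (guest_CR1_321 >> 1) & 1. */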
+
+static void putCR321 ( UInt cr, IRExpr* e )
+{
+ vassert(cr < 8);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
+ stmt( IRStmt_Put(guestCR321offset(cr), e) );
+}
+
+static void putCR0 ( UInt cr, IRExpr* e )
+{
+ vassert(cr < 8);
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
+ stmt( IRStmt_Put(guestCR0offset(cr), e) );
+}
+
+static IRExpr* /* :: Ity_I8 */ getCR0 ( UInt cr )
+{
+ vassert(cr < 8);
+ return IRExpr_Get(guestCR0offset(cr), Ity_I8);
+}
+
+static IRExpr* /* :: Ity_I8 */ getCR321 ( UInt cr )
+{
+ vassert(cr < 8);
+ return IRExpr_Get(guestCR321offset(cr), Ity_I8);
+}
+
+/* Fetch the specified CR bit (as per IBM/hardware notation) and
+ return it at the bottom of an I32; the top 31 bits are guaranteed
+ to be zero. */
+static IRExpr* /* :: Ity_I32 */ getCRbit ( UInt bi )
+{
+ UInt n = bi / 4;
+ UInt off = bi % 4;
+ vassert(bi < 32);
+ if (off == 3) {
+ /* Fetch the SO bit for this CR field */
+ /* Note: And32 is redundant paranoia iff guest state only has 0
+ or 1 in that slot. */
+ return binop(Iop_And32, unop(Iop_8Uto32, getCR0(n)), mkU32(1));
+ } else {
+ /* Fetch the <, > or == bit for this CR field */
+ return binop( Iop_And32,
+ binop( Iop_Shr32,
+ unop(Iop_8Uto32, getCR321(n)),
+ mkU8(toUChar(3-off)) ),
+ mkU32(1) );
+ }
+}
+
+/* Dually, write the least significant bit of BIT to the specified CR
+ bit. Indexing as per getCRbit. */
+static void putCRbit ( UInt bi, IRExpr* bit )
+{
+ UInt n, off;
+ IRExpr* safe;
+ vassert(typeOfIRExpr(irsb->tyenv,bit) == Ity_I32);
+ safe = binop(Iop_And32, bit, mkU32(1));
+ n = bi / 4;
+ off = bi % 4;
+ vassert(bi < 32);
+ if (off == 3) {
+ /* This is the SO bit for this CR field */
+ putCR0(n, unop(Iop_32to8, safe));
+ } else {
+ off = 3 - off;
+ vassert(off == 1 || off == 2 || off == 3);
+ putCR321(
+ n,
+ unop( Iop_32to8,
+ binop( Iop_Or32,
+ /* old value with field masked out */
+ binop(Iop_And32, unop(Iop_8Uto32, getCR321(n)),
+ mkU32(~(1 << off))),
+ /* new value in the right place */
+ binop(Iop_Shl32, safe, mkU8(toUChar(off)))
+ )
+ )
+ );
+ }
+}
+
+/* Fetch the specified CR bit (as per IBM/hardware notation) and
+ return it somewhere in an I32; it does not matter where, but
+ whichever bit it is, all other bits are guaranteed to be zero. In
+ other words, the I32-typed expression will be zero if the bit is
+   zero and nonzero if the bit is 1.  The index at which the bit
+   lands is written into *where. */
+
+static
+IRExpr* /* :: Ity_I32 */ getCRbit_anywhere ( UInt bi, Int* where )
+{
+ UInt n = bi / 4;
+ UInt off = bi % 4;
+ vassert(bi < 32);
+ if (off == 3) {
+ /* Fetch the SO bit for this CR field */
+ /* Note: And32 is redundant paranoia iff guest state only has 0
+ or 1 in that slot. */
+ *where = 0;
+ return binop(Iop_And32, unop(Iop_8Uto32, getCR0(n)), mkU32(1));
+ } else {
+ /* Fetch the <, > or == bit for this CR field */
+ *where = 3-off;
+ return binop( Iop_And32,
+ unop(Iop_8Uto32, getCR321(n)),
+ mkU32(1 << (3-off)) );
+ }
+}
+
+/* Set the CR0 flags following an arithmetic operation.
+ (Condition Register CR0 Field Definition, PPC32 p60)
+*/
+static IRExpr* getXER_SO ( void );
+static void set_CR0 ( IRExpr* result )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,result) == Ity_I32 ||
+ typeOfIRExpr(irsb->tyenv,result) == Ity_I64);
+ if (mode64) {
+ putCR321( 0, unop(Iop_64to8,
+ binop(Iop_CmpORD64S, result, mkU64(0))) );
+ } else {
+ putCR321( 0, unop(Iop_32to8,
+ binop(Iop_CmpORD32S, result, mkU32(0))) );
+ }
+ putCR0( 0, getXER_SO() );
+}
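+/* Note that CmpORD32S(x,0) evaluates to 8 (x < 0), 4 (x > 0) or
+   2 (x == 0), so its value already carries the LT/GT/EQ flags in bit
+   positions 3..1, exactly as putCR321 expects; getXER_SO supplies
+   the fourth (SO) bit separately. */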
+
+
+/* Set the CR6 flags following an AltiVec compare operation. */
+static void set_AV_CR6 ( IRExpr* result, Bool test_all_ones )
+{
+ /* CR6[0:3] = {all_ones, 0, all_zeros, 0}
+ all_ones = (v[0] && v[1] && v[2] && v[3])
+ all_zeros = ~(v[0] || v[1] || v[2] || v[3])
+ */
+ IRTemp v0 = newTemp(Ity_V128);
+ IRTemp v1 = newTemp(Ity_V128);
+ IRTemp v2 = newTemp(Ity_V128);
+ IRTemp v3 = newTemp(Ity_V128);
+ IRTemp rOnes = newTemp(Ity_I8);
+ IRTemp rZeros = newTemp(Ity_I8);
+
+ vassert(typeOfIRExpr(irsb->tyenv,result) == Ity_V128);
+
+ assign( v0, result );
+ assign( v1, binop(Iop_ShrV128, result, mkU8(32)) );
+ assign( v2, binop(Iop_ShrV128, result, mkU8(64)) );
+ assign( v3, binop(Iop_ShrV128, result, mkU8(96)) );
+
+ assign( rZeros, unop(Iop_1Uto8,
+ binop(Iop_CmpEQ32, mkU32(0xFFFFFFFF),
+ unop(Iop_Not32,
+ unop(Iop_V128to32,
+ binop(Iop_OrV128,
+ binop(Iop_OrV128, mkexpr(v0), mkexpr(v1)),
+ binop(Iop_OrV128, mkexpr(v2), mkexpr(v3))))
+ ))) );
+
+ if (test_all_ones) {
+ assign( rOnes, unop(Iop_1Uto8,
+ binop(Iop_CmpEQ32, mkU32(0xFFFFFFFF),
+ unop(Iop_V128to32,
+ binop(Iop_AndV128,
+ binop(Iop_AndV128, mkexpr(v0), mkexpr(v1)),
+ binop(Iop_AndV128, mkexpr(v2), mkexpr(v3)))
+ ))) );
+ putCR321( 6, binop(Iop_Or8,
+ binop(Iop_Shl8, mkexpr(rOnes), mkU8(3)),
+ binop(Iop_Shl8, mkexpr(rZeros), mkU8(1))) );
+ } else {
+ putCR321( 6, binop(Iop_Shl8, mkexpr(rZeros), mkU8(1)) );
+ }
+ putCR0( 6, mkU8(0) );
+}
+
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for XER flags. ---*/
+/*------------------------------------------------------------*/
+
+static void putXER_SO ( IRExpr* e )
+{
+ IRExpr* so;
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
+ so = binop(Iop_And8, e, mkU8(1));
+ stmt( IRStmt_Put( OFFB_XER_SO, so ) );
+}
+
+static void putXER_OV ( IRExpr* e )
+{
+ IRExpr* ov;
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
+ ov = binop(Iop_And8, e, mkU8(1));
+ stmt( IRStmt_Put( OFFB_XER_OV, ov ) );
+}
+
+static void putXER_CA ( IRExpr* e )
+{
+ IRExpr* ca;
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
+ ca = binop(Iop_And8, e, mkU8(1));
+ stmt( IRStmt_Put( OFFB_XER_CA, ca ) );
+}
+
+static void putXER_BC ( IRExpr* e )
+{
+ IRExpr* bc;
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
+ bc = binop(Iop_And8, e, mkU8(0x7F));
+ stmt( IRStmt_Put( OFFB_XER_BC, bc ) );
+}
+
+static IRExpr* /* :: Ity_I8 */ getXER_SO ( void )
+{
+ return IRExpr_Get( OFFB_XER_SO, Ity_I8 );
+}
+
+static IRExpr* /* :: Ity_I32 */ getXER_SO32 ( void )
+{
+ return binop( Iop_And32, unop(Iop_8Uto32, getXER_SO()), mkU32(1) );
+}
+
+static IRExpr* /* :: Ity_I8 */ getXER_OV ( void )
+{
+ return IRExpr_Get( OFFB_XER_OV, Ity_I8 );
+}
+
+static IRExpr* /* :: Ity_I32 */ getXER_OV32 ( void )
+{
+ return binop( Iop_And32, unop(Iop_8Uto32, getXER_OV()), mkU32(1) );
+}
+
+static IRExpr* /* :: Ity_I32 */ getXER_CA32 ( void )
+{
+ IRExpr* ca = IRExpr_Get( OFFB_XER_CA, Ity_I8 );
+ return binop( Iop_And32, unop(Iop_8Uto32, ca ), mkU32(1) );
+}
+
+static IRExpr* /* :: Ity_I8 */ getXER_BC ( void )
+{
+ return IRExpr_Get( OFFB_XER_BC, Ity_I8 );
+}
+
+static IRExpr* /* :: Ity_I32 */ getXER_BC32 ( void )
+{
+ IRExpr* bc = IRExpr_Get( OFFB_XER_BC, Ity_I8 );
+ return binop( Iop_And32, unop(Iop_8Uto32, bc), mkU32(0x7F) );
+}
+
+
+/* RES is the result of doing OP on ARGL and ARGR. Set %XER.OV and
+ %XER.SO accordingly. */
+
+static void set_XER_OV_32( UInt op, IRExpr* res,
+ IRExpr* argL, IRExpr* argR )
+{
+ IRTemp t64;
+ IRExpr* xer_ov;
+ vassert(op < PPCG_FLAG_OP_NUMBER);
+ vassert(typeOfIRExpr(irsb->tyenv,res) == Ity_I32);
+ vassert(typeOfIRExpr(irsb->tyenv,argL) == Ity_I32);
+ vassert(typeOfIRExpr(irsb->tyenv,argR) == Ity_I32);
+
+# define INT32_MIN 0x80000000
+
+# define XOR2(_aa,_bb) \
+ binop(Iop_Xor32,(_aa),(_bb))
+
+# define XOR3(_cc,_dd,_ee) \
+ binop(Iop_Xor32,binop(Iop_Xor32,(_cc),(_dd)),(_ee))
+
+# define AND3(_ff,_gg,_hh) \
+ binop(Iop_And32,binop(Iop_And32,(_ff),(_gg)),(_hh))
+
+# define NOT(_jj) \
+ unop(Iop_Not32, (_jj))
+
+ switch (op) {
+ case /* 0 */ PPCG_FLAG_OP_ADD:
+ case /* 1 */ PPCG_FLAG_OP_ADDE:
+ /* (argL^argR^-1) & (argL^res) & (1<<31) ?1:0 */
+ // i.e. ((both_same_sign) & (sign_changed) & (sign_mask))
+ xer_ov
+ = AND3( XOR3(argL,argR,mkU32(-1)),
+ XOR2(argL,res),
+ mkU32(INT32_MIN) );
+ /* xer_ov can only be 0 or 1<<31 */
+ xer_ov
+ = binop(Iop_Shr32, xer_ov, mkU8(31) );
+ break;
+
+ case /* 2 */ PPCG_FLAG_OP_DIVW:
+ /* (argL == INT32_MIN && argR == -1) || argR == 0 */
+ xer_ov
+ = mkOR1(
+ mkAND1(
+ binop(Iop_CmpEQ32, argL, mkU32(INT32_MIN)),
+ binop(Iop_CmpEQ32, argR, mkU32(-1))
+ ),
+ binop(Iop_CmpEQ32, argR, mkU32(0) )
+ );
+ xer_ov
+ = unop(Iop_1Uto32, xer_ov);
+ break;
+
+ case /* 3 */ PPCG_FLAG_OP_DIVWU:
+ /* argR == 0 */
+ xer_ov
+ = unop(Iop_1Uto32, binop(Iop_CmpEQ32, argR, mkU32(0)));
+ break;
+
+ case /* 4 */ PPCG_FLAG_OP_MULLW:
+         /* OV true if result can't be represented in 32 bits,
+            i.e. sHi != sign extension of sLo */
+ t64 = newTemp(Ity_I64);
+ assign( t64, binop(Iop_MullS32, argL, argR) );
+ xer_ov
+ = binop( Iop_CmpNE32,
+ unop(Iop_64HIto32, mkexpr(t64)),
+ binop( Iop_Sar32,
+ unop(Iop_64to32, mkexpr(t64)),
+ mkU8(31))
+ );
+ xer_ov
+ = unop(Iop_1Uto32, xer_ov);
+ break;
+
+ case /* 5 */ PPCG_FLAG_OP_NEG:
+ /* argL == INT32_MIN */
+ xer_ov
+ = unop( Iop_1Uto32,
+ binop(Iop_CmpEQ32, argL, mkU32(INT32_MIN)) );
+ break;
+
+ case /* 6 */ PPCG_FLAG_OP_SUBF:
+ case /* 7 */ PPCG_FLAG_OP_SUBFC:
+ case /* 8 */ PPCG_FLAG_OP_SUBFE:
+ /* ((~argL)^argR^-1) & ((~argL)^res) & (1<<31) ?1:0; */
+ xer_ov
+ = AND3( XOR3(NOT(argL),argR,mkU32(-1)),
+ XOR2(NOT(argL),res),
+ mkU32(INT32_MIN) );
+ /* xer_ov can only be 0 or 1<<31 */
+ xer_ov
+ = binop(Iop_Shr32, xer_ov, mkU8(31) );
+ break;
+
+ default:
+ vex_printf("set_XER_OV: op = %u\n", op);
+ vpanic("set_XER_OV(ppc)");
+ }
+
+ /* xer_ov MUST denote either 0 or 1, no other value allowed */
+ putXER_OV( unop(Iop_32to8, xer_ov) );
+
+ /* Update the summary overflow */
+ putXER_SO( binop(Iop_Or8, getXER_SO(), getXER_OV()) );
+
+# undef INT32_MIN
+# undef AND3
+# undef XOR3
+# undef XOR2
+# undef NOT
+}
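+/* Worked example for the ADD rule (illustrative): for
+   0x7FFFFFFF + 1 = 0x80000000, argL and argR have equal sign bits
+   (so XOR3(argL,argR,-1) has bit 31 set) and the result's sign
+   differs from argL's (so XOR2(argL,res) has bit 31 set), hence
+   xer_ov = 1; for 1 + 1 = 2 the second term is 0 and xer_ov = 0. */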
+
+static void set_XER_OV_64( UInt op, IRExpr* res,
+ IRExpr* argL, IRExpr* argR )
+{
+ IRExpr* xer_ov;
+ vassert(op < PPCG_FLAG_OP_NUMBER);
+ vassert(typeOfIRExpr(irsb->tyenv,res) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv,argL) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv,argR) == Ity_I64);
+
+# define INT64_MIN 0x8000000000000000ULL
+
+# define XOR2(_aa,_bb) \
+ binop(Iop_Xor64,(_aa),(_bb))
+
+# define XOR3(_cc,_dd,_ee) \
+ binop(Iop_Xor64,binop(Iop_Xor64,(_cc),(_dd)),(_ee))
+
+# define AND3(_ff,_gg,_hh) \
+ binop(Iop_And64,binop(Iop_And64,(_ff),(_gg)),(_hh))
+
+# define NOT(_jj) \
+ unop(Iop_Not64, (_jj))
+
+ switch (op) {
+ case /* 0 */ PPCG_FLAG_OP_ADD:
+ case /* 1 */ PPCG_FLAG_OP_ADDE:
+ /* (argL^argR^-1) & (argL^res) & (1<<63) ? 1:0 */
+ // i.e. ((both_same_sign) & (sign_changed) & (sign_mask))
+ xer_ov
+ = AND3( XOR3(argL,argR,mkU64(-1)),
+ XOR2(argL,res),
+ mkU64(INT64_MIN) );
+ /* xer_ov can only be 0 or 1<<63 */
+ xer_ov
+ = unop(Iop_64to1, binop(Iop_Shr64, xer_ov, mkU8(63)));
+ break;
+
+ case /* 2 */ PPCG_FLAG_OP_DIVW:
+ /* (argL == INT64_MIN && argR == -1) || argR == 0 */
+ xer_ov
+ = mkOR1(
+ mkAND1(
+ binop(Iop_CmpEQ64, argL, mkU64(INT64_MIN)),
+ binop(Iop_CmpEQ64, argR, mkU64(-1))
+ ),
+ binop(Iop_CmpEQ64, argR, mkU64(0) )
+ );
+ break;
+
+ case /* 3 */ PPCG_FLAG_OP_DIVWU:
+ /* argR == 0 */
+ xer_ov
+ = binop(Iop_CmpEQ64, argR, mkU64(0));
+ break;
+
+ case /* 4 */ PPCG_FLAG_OP_MULLW: {
+         /* OV true if the 32x32-bit product can't be represented in
+            32 bits, i.e. sHi != sign extension of sLo */
+ xer_ov
+ = binop( Iop_CmpNE32,
+ unop(Iop_64HIto32, res),
+ binop( Iop_Sar32,
+ unop(Iop_64to32, res),
+ mkU8(31))
+ );
+ break;
+ }
+
+ case /* 5 */ PPCG_FLAG_OP_NEG:
+ /* argL == INT64_MIN */
+ xer_ov
+ = binop(Iop_CmpEQ64, argL, mkU64(INT64_MIN));
+ break;
+
+ case /* 6 */ PPCG_FLAG_OP_SUBF:
+ case /* 7 */ PPCG_FLAG_OP_SUBFC:
+ case /* 8 */ PPCG_FLAG_OP_SUBFE:
+ /* ((~argL)^argR^-1) & ((~argL)^res) & (1<<63) ?1:0; */
+ xer_ov
+ = AND3( XOR3(NOT(argL),argR,mkU64(-1)),
+ XOR2(NOT(argL),res),
+ mkU64(INT64_MIN) );
+ /* xer_ov can only be 0 or 1<<63 */
+ xer_ov
+ = unop(Iop_64to1, binop(Iop_Shr64, xer_ov, mkU8(63)));
+ break;
+
+ default:
+ vex_printf("set_XER_OV: op = %u\n", op);
+ vpanic("set_XER_OV(ppc64)");
+ }
+
+ /* xer_ov MUST denote either 0 or 1, no other value allowed */
+ putXER_OV( unop(Iop_1Uto8, xer_ov) );
+
+ /* Update the summary overflow */
+ putXER_SO( binop(Iop_Or8, getXER_SO(), getXER_OV()) );
+
+# undef INT64_MIN
+# undef AND3
+# undef XOR3
+# undef XOR2
+# undef NOT
+}
+
+static void set_XER_OV ( IRType ty, UInt op, IRExpr* res,
+ IRExpr* argL, IRExpr* argR )
+{
+ if (ty == Ity_I32)
+ set_XER_OV_32( op, res, argL, argR );
+ else
+ set_XER_OV_64( op, res, argL, argR );
+}
+
+
+
+/* RES is the result of doing OP on ARGL and ARGR with the old %XER.CA
+ value being OLDCA. Set %XER.CA accordingly. */
+
+static void set_XER_CA_32 ( UInt op, IRExpr* res,
+ IRExpr* argL, IRExpr* argR, IRExpr* oldca )
+{
+ IRExpr* xer_ca;
+ vassert(op < PPCG_FLAG_OP_NUMBER);
+ vassert(typeOfIRExpr(irsb->tyenv,res) == Ity_I32);
+ vassert(typeOfIRExpr(irsb->tyenv,argL) == Ity_I32);
+ vassert(typeOfIRExpr(irsb->tyenv,argR) == Ity_I32);
+ vassert(typeOfIRExpr(irsb->tyenv,oldca) == Ity_I32);
+
+ /* Incoming oldca is assumed to hold the values 0 or 1 only. This
+ seems reasonable given that it's always generated by
+ getXER_CA32(), which masks it accordingly. In any case it being
+ 0 or 1 is an invariant of the ppc guest state representation;
+ if it has any other value, that invariant has been violated. */
+
+ switch (op) {
+ case /* 0 */ PPCG_FLAG_OP_ADD:
+ /* res <u argL */
+ xer_ca
+ = unop(Iop_1Uto32, binop(Iop_CmpLT32U, res, argL));
+ break;
+
+ case /* 1 */ PPCG_FLAG_OP_ADDE:
+ /* res <u argL || (old_ca==1 && res==argL) */
+ xer_ca
+ = mkOR1(
+ binop(Iop_CmpLT32U, res, argL),
+ mkAND1(
+ binop(Iop_CmpEQ32, oldca, mkU32(1)),
+ binop(Iop_CmpEQ32, res, argL)
+ )
+ );
+ xer_ca
+ = unop(Iop_1Uto32, xer_ca);
+ break;
+
+ case /* 8 */ PPCG_FLAG_OP_SUBFE:
+ /* res <u argR || (old_ca==1 && res==argR) */
+ xer_ca
+ = mkOR1(
+ binop(Iop_CmpLT32U, res, argR),
+ mkAND1(
+ binop(Iop_CmpEQ32, oldca, mkU32(1)),
+ binop(Iop_CmpEQ32, res, argR)
+ )
+ );
+ xer_ca
+ = unop(Iop_1Uto32, xer_ca);
+ break;
+
+ case /* 7 */ PPCG_FLAG_OP_SUBFC:
+ case /* 9 */ PPCG_FLAG_OP_SUBFI:
+ /* res <=u argR */
+ xer_ca
+ = unop(Iop_1Uto32, binop(Iop_CmpLE32U, res, argR));
+ break;
+
+ case /* 10 */ PPCG_FLAG_OP_SRAW:
+ /* The shift amount is guaranteed to be in 0 .. 63 inclusive.
+ If it is <= 31, behave like SRAWI; else XER.CA is the sign
+ bit of argL. */
+ /* This term valid for shift amount < 32 only */
+ xer_ca
+ = binop(
+ Iop_And32,
+ binop(Iop_Sar32, argL, mkU8(31)),
+ binop( Iop_And32,
+ argL,
+ binop( Iop_Sub32,
+ binop(Iop_Shl32, mkU32(1),
+ unop(Iop_32to8,argR)),
+ mkU32(1) )
+ )
+ );
+ xer_ca
+ = IRExpr_Mux0X(
+ /* shift amt > 31 ? */
+ unop(Iop_1Uto8, binop(Iop_CmpLT32U, mkU32(31), argR)),
+ /* no -- be like srawi */
+ unop(Iop_1Uto32, binop(Iop_CmpNE32, xer_ca, mkU32(0))),
+ /* yes -- get sign bit of argL */
+ binop(Iop_Shr32, argL, mkU8(31))
+ );
+ break;
+
+ case /* 11 */ PPCG_FLAG_OP_SRAWI:
+ /* xer_ca is 1 iff src was negative and bits_shifted_out !=
+ 0. Since the shift amount is known to be in the range
+ 0 .. 31 inclusive the following seems viable:
+ xer.ca == 1 iff the following is nonzero:
+ (argL >>s 31) -- either all 0s or all 1s
+ & (argL & (1<<argR)-1) -- the stuff shifted out */
+ xer_ca
+ = binop(
+ Iop_And32,
+ binop(Iop_Sar32, argL, mkU8(31)),
+ binop( Iop_And32,
+ argL,
+ binop( Iop_Sub32,
+ binop(Iop_Shl32, mkU32(1),
+ unop(Iop_32to8,argR)),
+ mkU32(1) )
+ )
+ );
+ xer_ca
+ = unop(Iop_1Uto32, binop(Iop_CmpNE32, xer_ca, mkU32(0)));
+ break;
+
+ default:
+ vex_printf("set_XER_CA: op = %u\n", op);
+ vpanic("set_XER_CA(ppc)");
+ }
+
+ /* xer_ca MUST denote either 0 or 1, no other value allowed */
+ putXER_CA( unop(Iop_32to8, xer_ca) );
+}
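+/* Worked example for the ADD rule (illustrative): 0xFFFFFFFF + 1
+   wraps to 0, and 0 <u 0xFFFFFFFF, so CA = 1; whereas 1 + 1 = 2 is
+   not <u 1, so CA = 0. */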
+
+static void set_XER_CA_64 ( UInt op, IRExpr* res,
+ IRExpr* argL, IRExpr* argR, IRExpr* oldca )
+{
+ IRExpr* xer_ca;
+ vassert(op < PPCG_FLAG_OP_NUMBER);
+ vassert(typeOfIRExpr(irsb->tyenv,res) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv,argL) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv,argR) == Ity_I64);
+ vassert(typeOfIRExpr(irsb->tyenv,oldca) == Ity_I64);
+
+ /* Incoming oldca is assumed to hold the values 0 or 1 only. This
+ seems reasonable given that it's always generated by
+ getXER_CA32(), which masks it accordingly. In any case it being
+ 0 or 1 is an invariant of the ppc guest state representation;
+ if it has any other value, that invariant has been violated. */
+
+ switch (op) {
+ case /* 0 */ PPCG_FLAG_OP_ADD:
+ /* res <u argL */
+ xer_ca
+ = unop(Iop_1Uto32, binop(Iop_CmpLT64U, res, argL));
+ break;
+
+ case /* 1 */ PPCG_FLAG_OP_ADDE:
+ /* res <u argL || (old_ca==1 && res==argL) */
+ xer_ca
+ = mkOR1(
+ binop(Iop_CmpLT64U, res, argL),
+ mkAND1(
+ binop(Iop_CmpEQ64, oldca, mkU64(1)),
+ binop(Iop_CmpEQ64, res, argL)
+ )
+ );
+ xer_ca
+ = unop(Iop_1Uto32, xer_ca);
+ break;
+
+ case /* 8 */ PPCG_FLAG_OP_SUBFE:
+ /* res <u argR || (old_ca==1 && res==argR) */
+ xer_ca
+ = mkOR1(
+ binop(Iop_CmpLT64U, res, argR),
+ mkAND1(
+ binop(Iop_CmpEQ64, oldca, mkU64(1)),
+ binop(Iop_CmpEQ64, res, argR)
+ )
+ );
+ xer_ca
+ = unop(Iop_1Uto32, xer_ca);
+ break;
+
+ case /* 7 */ PPCG_FLAG_OP_SUBFC:
+ case /* 9 */ PPCG_FLAG_OP_SUBFI:
+ /* res <=u argR */
+ xer_ca
+ = unop(Iop_1Uto32, binop(Iop_CmpLE64U, res, argR));
+ break;
+
+
+ case /* 10 */ PPCG_FLAG_OP_SRAW:
+         /* The shift amount is guaranteed to be in 0 .. 63 inclusive.
+            If it is <= 31, behave like SRAWI; else XER.CA is the sign
+            bit of argL. */
+         /* This term valid for shift amount < 32 only */
+
+ xer_ca
+ = binop(
+ Iop_And64,
+ binop(Iop_Sar64, argL, mkU8(31)),
+ binop( Iop_And64,
+ argL,
+ binop( Iop_Sub64,
+ binop(Iop_Shl64, mkU64(1),
+ unop(Iop_64to8,argR)),
+ mkU64(1) )
+ )
+ );
+ xer_ca
+ = IRExpr_Mux0X(
+ /* shift amt > 31 ? */
+ unop(Iop_1Uto8, binop(Iop_CmpLT64U, mkU64(31), argR)),
+ /* no -- be like srawi */
+ unop(Iop_1Uto32, binop(Iop_CmpNE64, xer_ca, mkU64(0))),
+ /* yes -- get sign bit of argL */
+ unop(Iop_64to32, binop(Iop_Shr64, argL, mkU8(63)))
+ );
+ break;
+
+ case /* 11 */ PPCG_FLAG_OP_SRAWI:
+ /* xer_ca is 1 iff src was negative and bits_shifted_out != 0.
+ Since the shift amount is known to be in the range 0 .. 31
+ inclusive the following seems viable:
+ xer.ca == 1 iff the following is nonzero:
+ (argL >>s 31) -- either all 0s or all 1s
+ & (argL & (1<<argR)-1) -- the stuff shifted out */
+
+ xer_ca
+ = binop(
+ Iop_And64,
+ binop(Iop_Sar64, argL, mkU8(31)),
+ binop( Iop_And64,
+ argL,
+ binop( Iop_Sub64,
+ binop(Iop_Shl64, mkU64(1),
+ unop(Iop_64to8,argR)),
+ mkU64(1) )
+ )
+ );
+ xer_ca
+ = unop(Iop_1Uto32, binop(Iop_CmpNE64, xer_ca, mkU64(0)));
+ break;
+
+
+ case /* 12 */ PPCG_FLAG_OP_SRAD:
+         /* The shift amount is guaranteed to be in 0 .. 127 inclusive.
+            If it is <= 63, behave like SRADI; else XER.CA is the sign
+            bit of argL. */
+         /* This term valid for shift amount < 64 only */
+
+ xer_ca
+ = binop(
+ Iop_And64,
+ binop(Iop_Sar64, argL, mkU8(63)),
+ binop( Iop_And64,
+ argL,
+ binop( Iop_Sub64,
+ binop(Iop_Shl64, mkU64(1),
+ unop(Iop_64to8,argR)),
+ mkU64(1) )
+ )
+ );
+ xer_ca
+ = IRExpr_Mux0X(
+ /* shift amt > 63 ? */
+ unop(Iop_1Uto8, binop(Iop_CmpLT64U, mkU64(63), argR)),
+ /* no -- be like sradi */
+ unop(Iop_1Uto32, binop(Iop_CmpNE64, xer_ca, mkU64(0))),
+ /* yes -- get sign bit of argL */
+ unop(Iop_64to32, binop(Iop_Shr64, argL, mkU8(63)))
+ );
+ break;
+
+
+ case /* 13 */ PPCG_FLAG_OP_SRADI:
+ /* xer_ca is 1 iff src was negative and bits_shifted_out != 0.
+ Since the shift amount is known to be in the range 0 .. 63
+ inclusive, the following seems viable:
+ xer.ca == 1 iff the following is nonzero:
+ (argL >>s 63) -- either all 0s or all 1s
+ & (argL & (1<<argR)-1) -- the stuff shifted out */
+
+ xer_ca
+ = binop(
+ Iop_And64,
+ binop(Iop_Sar64, argL, mkU8(63)),
+ binop( Iop_And64,
+ argL,
+ binop( Iop_Sub64,
+ binop(Iop_Shl64, mkU64(1),
+ unop(Iop_64to8,argR)),
+ mkU64(1) )
+ )
+ );
+ xer_ca
+ = unop(Iop_1Uto32, binop(Iop_CmpNE64, xer_ca, mkU64(0)));
+ break;
+
+ default:
+ vex_printf("set_XER_CA: op = %u\n", op);
+ vpanic("set_XER_CA(ppc64)");
+ }
+
+ /* xer_ca MUST denote either 0 or 1, no other value allowed */
+ putXER_CA( unop(Iop_32to8, xer_ca) );
+}
+
+static void set_XER_CA ( IRType ty, UInt op, IRExpr* res,
+ IRExpr* argL, IRExpr* argR, IRExpr* oldca )
+{
+ if (ty == Ity_I32)
+ set_XER_CA_32( op, res, argL, argR, oldca );
+ else
+ set_XER_CA_64( op, res, argL, argR, oldca );
+}
+
+
+
+/*------------------------------------------------------------*/
+/*--- Read/write to guest-state ---*/
+/*------------------------------------------------------------*/
+
+static IRExpr* /* :: Ity_I32/64 */ getGST ( PPC_GST reg )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ switch (reg) {
+ case PPC_GST_SPRG3_RO:
+ return IRExpr_Get( OFFB_SPRG3_RO, ty );
+
+ case PPC_GST_CIA:
+ return IRExpr_Get( OFFB_CIA, ty );
+
+ case PPC_GST_LR:
+ return IRExpr_Get( OFFB_LR, ty );
+
+ case PPC_GST_CTR:
+ return IRExpr_Get( OFFB_CTR, ty );
+
+ case PPC_GST_VRSAVE:
+ return IRExpr_Get( OFFB_VRSAVE, Ity_I32 );
+
+ case PPC_GST_VSCR:
+ return binop(Iop_And32, IRExpr_Get( OFFB_VSCR,Ity_I32 ),
+ mkU32(MASK_VSCR_VALID));
+
+ case PPC_GST_CR: {
+ /* Synthesise the entire CR into a single word. Expensive. */
+# define FIELD(_n) \
+ binop(Iop_Shl32, \
+ unop(Iop_8Uto32, \
+ binop(Iop_Or8, \
+ binop(Iop_And8, getCR321(_n), mkU8(7<<1)), \
+ binop(Iop_And8, getCR0(_n), mkU8(1)) \
+ ) \
+ ), \
+ mkU8(4 * (7-(_n))) \
+ )
+ return binop(Iop_Or32,
+ binop(Iop_Or32,
+ binop(Iop_Or32, FIELD(0), FIELD(1)),
+ binop(Iop_Or32, FIELD(2), FIELD(3))
+ ),
+ binop(Iop_Or32,
+ binop(Iop_Or32, FIELD(4), FIELD(5)),
+ binop(Iop_Or32, FIELD(6), FIELD(7))
+ )
+ );
+# undef FIELD
+ }
+
+ case PPC_GST_XER:
+ return binop(Iop_Or32,
+ binop(Iop_Or32,
+ binop( Iop_Shl32, getXER_SO32(), mkU8(31)),
+ binop( Iop_Shl32, getXER_OV32(), mkU8(30))),
+ binop(Iop_Or32,
+ binop( Iop_Shl32, getXER_CA32(), mkU8(29)),
+ getXER_BC32()));
+
+ default:
+ vex_printf("getGST(ppc): reg = %u", reg);
+ vpanic("getGST(ppc)");
+ }
+}
+
+/* Get a masked word from the given reg */
+static IRExpr* /* ::Ity_I32 */ getGST_masked ( PPC_GST reg, UInt mask )
+{
+ IRTemp val = newTemp(Ity_I32);
+ vassert( reg < PPC_GST_MAX );
+
+ switch (reg) {
+
+ case PPC_GST_FPSCR: {
+ /* Vex-generated code expects the FPSCR to be set as follows:
+ all exceptions masked, round-to-nearest.
+ This corresponds to a FPSCR value of 0x0. */
+
+ /* We're only keeping track of the rounding mode,
+ so if the mask isn't asking for this, just return 0x0 */
+ if (mask & (MASK_FPSCR_RN|MASK_FPSCR_FPRF)) {
+ assign( val, IRExpr_Get( OFFB_FPROUND, Ity_I32 ) );
+ } else {
+ assign( val, mkU32(0x0) );
+ }
+ break;
+ }
+
+ default:
+ vex_printf("getGST_masked(ppc): reg = %u", reg);
+ vpanic("getGST_masked(ppc)");
+ }
+
+ if (mask != 0xFFFFFFFF) {
+ return binop(Iop_And32, mkexpr(val), mkU32(mask));
+ } else {
+ return mkexpr(val);
+ }
+}
+
+/* Fetch the specified REG[FLD] nibble (as per IBM/hardware notation)
+   and return it at the bottom of an I32; the top 28 bits are
+   guaranteed to be zero. */
+static IRExpr* /* ::Ity_I32 */ getGST_field ( PPC_GST reg, UInt fld )
+{
+ UInt shft, mask;
+
+ vassert( fld < 8 );
+ vassert( reg < PPC_GST_MAX );
+
+ shft = 4*(7-fld);
+ mask = 0xF<<shft;
+
+ switch (reg) {
+ case PPC_GST_XER:
+      vassert(fld == 7);
+ return binop(Iop_Or32,
+ binop(Iop_Or32,
+ binop(Iop_Shl32, getXER_SO32(), mkU8(3)),
+ binop(Iop_Shl32, getXER_OV32(), mkU8(2))),
+ binop( Iop_Shl32, getXER_CA32(), mkU8(1)));
+ break;
+
+ default:
+ if (shft == 0)
+ return getGST_masked( reg, mask );
+ else
+ return binop(Iop_Shr32,
+ getGST_masked( reg, mask ),
+ mkU8(toUChar( shft )));
+ }
+}
+
+static void putGST ( PPC_GST reg, IRExpr* src )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRType ty_src = typeOfIRExpr(irsb->tyenv,src );
+ vassert( reg < PPC_GST_MAX );
+ switch (reg) {
+ case PPC_GST_IP_AT_SYSCALL:
+ vassert( ty_src == ty );
+ stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL, src ) );
+ break;
+ case PPC_GST_CIA:
+ vassert( ty_src == ty );
+ stmt( IRStmt_Put( OFFB_CIA, src ) );
+ break;
+ case PPC_GST_LR:
+ vassert( ty_src == ty );
+ stmt( IRStmt_Put( OFFB_LR, src ) );
+ break;
+ case PPC_GST_CTR:
+ vassert( ty_src == ty );
+ stmt( IRStmt_Put( OFFB_CTR, src ) );
+ break;
+ case PPC_GST_VRSAVE:
+ vassert( ty_src == Ity_I32 );
+ stmt( IRStmt_Put( OFFB_VRSAVE,src));
+ break;
+ case PPC_GST_VSCR:
+ vassert( ty_src == Ity_I32 );
+ stmt( IRStmt_Put( OFFB_VSCR,
+ binop(Iop_And32, src,
+ mkU32(MASK_VSCR_VALID)) ) );
+ break;
+ case PPC_GST_XER:
+ vassert( ty_src == Ity_I32 );
+ putXER_SO( unop(Iop_32to8, binop(Iop_Shr32, src, mkU8(31))) );
+ putXER_OV( unop(Iop_32to8, binop(Iop_Shr32, src, mkU8(30))) );
+ putXER_CA( unop(Iop_32to8, binop(Iop_Shr32, src, mkU8(29))) );
+ putXER_BC( unop(Iop_32to8, src) );
+ break;
+
+ case PPC_GST_EMWARN:
+ vassert( ty_src == Ity_I32 );
+ stmt( IRStmt_Put( OFFB_EMWARN,src) );
+ break;
+
+ case PPC_GST_TISTART:
+ vassert( ty_src == ty );
+ stmt( IRStmt_Put( OFFB_TISTART, src) );
+ break;
+
+ case PPC_GST_TILEN:
+ vassert( ty_src == ty );
+ stmt( IRStmt_Put( OFFB_TILEN, src) );
+ break;
+
+ default:
+ vex_printf("putGST(ppc): reg = %u", reg);
+ vpanic("putGST(ppc)");
+ }
+}
+
+/* Write masked src to the given reg */
+static void putGST_masked ( PPC_GST reg, IRExpr* src, UInt mask )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ vassert( reg < PPC_GST_MAX );
+ vassert( typeOfIRExpr(irsb->tyenv,src ) == Ity_I32 );
+
+ switch (reg) {
+ case PPC_GST_FPSCR: {
+ /* Allow writes to Rounding Mode */
+ if (mask & (MASK_FPSCR_RN|MASK_FPSCR_FPRF)) {
+ /* construct new fpround from new and old values as per mask:
+            new fpround = (src & RN_FPRF & mask) | (fpround & RN_FPRF & ~mask),
+            where RN_FPRF is MASK_FPSCR_RN|MASK_FPSCR_FPRF */
+ stmt(
+ IRStmt_Put(
+ OFFB_FPROUND,
+ binop(
+ Iop_Or32,
+ binop(Iop_And32, src, mkU32((MASK_FPSCR_RN|MASK_FPSCR_FPRF) & mask)),
+ binop(
+ Iop_And32,
+ IRExpr_Get(OFFB_FPROUND,Ity_I32),
+ mkU32((MASK_FPSCR_RN|MASK_FPSCR_FPRF) & ~mask)
+ )
+ )
+ )
+ );
+ }
+
+ /* Give EmWarn for attempted writes to:
+ - Exception Controls
+ - Non-IEEE Mode
+ */
+      if (mask & 0xFC) { // Exception Control, Non-IEEE mode
+ VexEmWarn ew = EmWarn_PPCexns;
+
+ /* If any of the src::exception_control bits are actually set,
+ side-exit to the next insn, reporting the warning,
+ so that Valgrind's dispatcher sees the warning. */
+ putGST( PPC_GST_EMWARN, mkU32(ew) );
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32, mkU32(ew), mkU32(EmWarn_NONE)),
+ Ijk_EmWarn,
+ mkSzConst( ty, nextInsnAddr()) ));
+ }
+
+ /* Ignore all other writes */
+ break;
+ }
+
+ default:
+ vex_printf("putGST_masked(ppc): reg = %u", reg);
+ vpanic("putGST_masked(ppc)");
+ }
+}
+
+/* Write the least significant nibble of src to the specified
+ REG[FLD] (as per IBM/hardware notation). */
+static void putGST_field ( PPC_GST reg, IRExpr* src, UInt fld )
+{
+ UInt shft, mask;
+
+ vassert( typeOfIRExpr(irsb->tyenv,src ) == Ity_I32 );
+ vassert( fld < 8 );
+ vassert( reg < PPC_GST_MAX );
+
+ shft = 4*(7-fld);
+ mask = 0xF<<shft;
+
+ switch (reg) {
+ case PPC_GST_CR:
+ putCR0 (fld, binop(Iop_And8, mkU8(1 ), unop(Iop_32to8, src)));
+ putCR321(fld, binop(Iop_And8, mkU8(7<<1), unop(Iop_32to8, src)));
+ break;
+
+ default:
+ if (shft == 0) {
+ putGST_masked( reg, src, mask );
+ } else {
+ putGST_masked( reg,
+ binop(Iop_Shl32, src, mkU8(toUChar(shft))),
+ mask );
+ }
+ }
+}
+
+
+
+/*------------------------------------------------------------*/
+/*--- Integer Instruction Translation ---*/
+/*------------------------------------------------------------*/
+
+/*
+ Integer Arithmetic Instructions
+*/
+static Bool dis_int_arith ( UInt theInstr )
+{
+ /* D-Form, XO-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar rD_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UInt uimm16 = ifieldUIMM16(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UChar flag_OE = ifieldBIT10(theInstr);
+ UInt opc2 = ifieldOPClo9(theInstr);
+ UChar flag_rC = ifieldBIT0(theInstr);
+
+ Long simm16 = extend_s_16to64(uimm16);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp rA = newTemp(ty);
+ IRTemp rB = newTemp(ty);
+ IRTemp rD = newTemp(ty);
+
+ Bool do_rc = False;
+
+ assign( rA, getIReg(rA_addr) );
+ assign( rB, getIReg(rB_addr) ); // XO-Form: rD, rA, rB
+
+ switch (opc1) {
+ /* D-Form */
+   case 0x0C: // addic  (Add Immediate Carrying, PPC32 p351)
+ DIP("addic r%u,r%u,%d\n", rD_addr, rA_addr, (Int)simm16);
+ assign( rD, binop( mkSzOp(ty, Iop_Add8), mkexpr(rA),
+ mkSzExtendS16(ty, uimm16) ) );
+ set_XER_CA( ty, PPCG_FLAG_OP_ADD,
+ mkexpr(rD), mkexpr(rA), mkSzExtendS16(ty, uimm16),
+ mkSzImm(ty, 0)/*old xer.ca, which is ignored*/ );
+ break;
+
+ case 0x0D: // addic. (Add Immediate Carrying and Record, PPC32 p352)
+ DIP("addic. r%u,r%u,%d\n", rD_addr, rA_addr, (Int)simm16);
+ assign( rD, binop( mkSzOp(ty, Iop_Add8), mkexpr(rA),
+ mkSzExtendS16(ty, uimm16) ) );
+ set_XER_CA( ty, PPCG_FLAG_OP_ADD,
+ mkexpr(rD), mkexpr(rA), mkSzExtendS16(ty, uimm16),
+ mkSzImm(ty, 0)/*old xer.ca, which is ignored*/ );
+ do_rc = True; // Always record to CR
+ flag_rC = 1;
+ break;
+
+ case 0x0E: // addi (Add Immediate, PPC32 p350)
+ // li rD,val == addi rD,0,val
+      // la rD,disp(rA) == addi rD,rA,disp
+ if ( rA_addr == 0 ) {
+ DIP("li r%u,%d\n", rD_addr, (Int)simm16);
+ assign( rD, mkSzExtendS16(ty, uimm16) );
+ } else {
+ DIP("addi r%u,r%u,%d\n", rD_addr, rA_addr, (Int)simm16);
+ assign( rD, binop( mkSzOp(ty, Iop_Add8), mkexpr(rA),
+ mkSzExtendS16(ty, uimm16) ) );
+ }
+ break;
+
+ case 0x0F: // addis (Add Immediate Shifted, PPC32 p353)
+ // lis rD,val == addis rD,0,val
+ if ( rA_addr == 0 ) {
+ DIP("lis r%u,%d\n", rD_addr, (Int)simm16);
+ assign( rD, mkSzExtendS32(ty, uimm16 << 16) );
+ } else {
+ DIP("addis r%u,r%u,0x%x\n", rD_addr, rA_addr, (Int)simm16);
+ assign( rD, binop( mkSzOp(ty, Iop_Add8), mkexpr(rA),
+ mkSzExtendS32(ty, uimm16 << 16) ) );
+ }
+ break;
+
+ case 0x07: // mulli (Multiply Low Immediate, PPC32 p490)
+ DIP("mulli r%u,r%u,%d\n", rD_addr, rA_addr, (Int)simm16);
+ if (mode64)
+ assign( rD, unop(Iop_128to64,
+ binop(Iop_MullS64, mkexpr(rA),
+ mkSzExtendS16(ty, uimm16))) );
+ else
+ assign( rD, unop(Iop_64to32,
+ binop(Iop_MullS32, mkexpr(rA),
+ mkSzExtendS16(ty, uimm16))) );
+ break;
+
+ case 0x08: // subfic (Subtract from Immediate Carrying, PPC32 p540)
+ DIP("subfic r%u,r%u,%d\n", rD_addr, rA_addr, (Int)simm16);
+ // rD = simm16 - rA
+ assign( rD, binop( mkSzOp(ty, Iop_Sub8),
+ mkSzExtendS16(ty, uimm16),
+ mkexpr(rA)) );
+ set_XER_CA( ty, PPCG_FLAG_OP_SUBFI,
+ mkexpr(rD), mkexpr(rA), mkSzExtendS16(ty, uimm16),
+ mkSzImm(ty, 0)/*old xer.ca, which is ignored*/ );
+ break;
+
+ /* XO-Form */
+ case 0x1F:
+ do_rc = True; // All below record to CR
+
+ switch (opc2) {
+ case 0x10A: // add (Add, PPC32 p347)
+ DIP("add%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ assign( rD, binop( mkSzOp(ty, Iop_Add8),
+ mkexpr(rA), mkexpr(rB) ) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_ADD,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ break;
+
+ case 0x00A: // addc (Add Carrying, PPC32 p348)
+ DIP("addc%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ assign( rD, binop( mkSzOp(ty, Iop_Add8),
+ mkexpr(rA), mkexpr(rB)) );
+ set_XER_CA( ty, PPCG_FLAG_OP_ADD,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB),
+ mkSzImm(ty, 0)/*old xer.ca, which is ignored*/ );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_ADD,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ break;
+
+ case 0x08A: { // adde (Add Extended, PPC32 p349)
+ IRTemp old_xer_ca = newTemp(ty);
+ DIP("adde%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ // rD = rA + rB + XER[CA]
+ assign( old_xer_ca, mkWidenFrom32(ty, getXER_CA32(), False) );
+ assign( rD, binop( mkSzOp(ty, Iop_Add8), mkexpr(rA),
+ binop( mkSzOp(ty, Iop_Add8),
+ mkexpr(rB), mkexpr(old_xer_ca))) );
+ set_XER_CA( ty, PPCG_FLAG_OP_ADDE,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB),
+ mkexpr(old_xer_ca) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_ADDE,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ break;
+ }
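+
+      /* Note (editor's addition): addc/adde form the usual multi-word
+         addition idiom -- addc produces the low word and sets XER.CA,
+         and adde then folds that carry into the next-higher word. */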
+
+ case 0x0EA: { // addme (Add to Minus One Extended, PPC32 p354)
+ IRTemp old_xer_ca = newTemp(ty);
+ IRExpr *min_one;
+ if (rB_addr != 0) {
+ vex_printf("dis_int_arith(ppc)(addme,rB_addr)\n");
+ return False;
+ }
+ DIP("addme%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ // rD = rA + (-1) + XER[CA]
+ // => Just another form of adde
+ assign( old_xer_ca, mkWidenFrom32(ty, getXER_CA32(), False) );
+ min_one = mkSzImm(ty, (Long)-1);
+ assign( rD, binop( mkSzOp(ty, Iop_Add8), mkexpr(rA),
+ binop( mkSzOp(ty, Iop_Add8),
+ min_one, mkexpr(old_xer_ca)) ));
+ set_XER_CA( ty, PPCG_FLAG_OP_ADDE,
+ mkexpr(rD), mkexpr(rA), min_one,
+ mkexpr(old_xer_ca) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_ADDE,
+ mkexpr(rD), mkexpr(rA), min_one );
+ }
+ break;
+ }
+
+ case 0x0CA: { // addze (Add to Zero Extended, PPC32 p355)
+ IRTemp old_xer_ca = newTemp(ty);
+ if (rB_addr != 0) {
+ vex_printf("dis_int_arith(ppc)(addze,rB_addr)\n");
+ return False;
+ }
+ DIP("addze%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ // rD = rA + (0) + XER[CA]
+ // => Just another form of adde
+ assign( old_xer_ca, mkWidenFrom32(ty, getXER_CA32(), False) );
+ assign( rD, binop( mkSzOp(ty, Iop_Add8),
+ mkexpr(rA), mkexpr(old_xer_ca)) );
+ set_XER_CA( ty, PPCG_FLAG_OP_ADDE,
+ mkexpr(rD), mkexpr(rA), mkSzImm(ty, 0),
+ mkexpr(old_xer_ca) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_ADDE,
+ mkexpr(rD), mkexpr(rA), mkSzImm(ty, 0) );
+ }
+ break;
+ }
+
+ case 0x1EB: // divw (Divide Word, PPC32 p388)
+ DIP("divw%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ if (mode64) {
+ /* Note:
+ XER settings are mode independent, and reflect the
+ overflow of the low-order 32bit result
+ CR0[LT|GT|EQ] are undefined if flag_rC && mode64
+ */
+ /* rD[hi32] are undefined: setting them to sign of lo32
+ - makes set_CR0 happy */
+ IRExpr* dividend = mk64lo32Sto64( mkexpr(rA) );
+ IRExpr* divisor = mk64lo32Sto64( mkexpr(rB) );
+ assign( rD, mk64lo32Uto64( binop(Iop_DivS64, dividend,
+ divisor) ) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_DIVW,
+ mkexpr(rD), dividend, divisor );
+ }
+ } else {
+ assign( rD, binop(Iop_DivS32, mkexpr(rA), mkexpr(rB)) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_DIVW,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ }
+ /* Note:
+ if (0x8000_0000 / -1) or (x / 0)
+ => rD=undef, if(flag_rC) CR7=undef, if(flag_OE) XER_OV=1
+ => But _no_ exception raised. */
+ break;
+
+ case 0x1CB: // divwu (Divide Word Unsigned, PPC32 p389)
+ DIP("divwu%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ if (mode64) {
+ /* Note:
+ XER settings are mode independent, and reflect the
+ overflow of the low-order 32bit result
+ CR0[LT|GT|EQ] are undefined if flag_rC && mode64
+ */
+ IRExpr* dividend = mk64lo32Uto64( mkexpr(rA) );
+ IRExpr* divisor = mk64lo32Uto64( mkexpr(rB) );
+ assign( rD, mk64lo32Uto64( binop(Iop_DivU64, dividend,
+ divisor) ) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_DIVWU,
+ mkexpr(rD), dividend, divisor );
+ }
+ } else {
+ assign( rD, binop(Iop_DivU32, mkexpr(rA), mkexpr(rB)) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_DIVWU,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ }
+ /* Note: ditto comment divw, for (x / 0) */
+ break;
+
+ case 0x04B: // mulhw (Multiply High Word, PPC32 p488)
+ if (flag_OE != 0) {
+ vex_printf("dis_int_arith(ppc)(mulhw,flag_OE)\n");
+ return False;
+ }
+ DIP("mulhw%s r%u,r%u,r%u\n", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ if (mode64) {
+ /* rD[hi32] are undefined: setting them to sign of lo32
+ - makes set_CR0 happy */
+ assign( rD, binop(Iop_Sar64,
+ binop(Iop_Mul64,
+ mk64lo32Sto64( mkexpr(rA) ),
+ mk64lo32Sto64( mkexpr(rB) )),
+ mkU8(32)) );
+ } else {
+ assign( rD, unop(Iop_64HIto32,
+ binop(Iop_MullS32,
+ mkexpr(rA), mkexpr(rB))) );
+ }
+ break;
+
+ case 0x00B: // mulhwu (Multiply High Word Unsigned, PPC32 p489)
+ if (flag_OE != 0) {
+ vex_printf("dis_int_arith(ppc)(mulhwu,flag_OE)\n");
+ return False;
+ }
+ DIP("mulhwu%s r%u,r%u,r%u\n", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ if (mode64) {
+ /* rD[hi32] are undefined: setting them to sign of lo32
+ - makes set_CR0 happy */
+ assign( rD, binop(Iop_Sar64,
+ binop(Iop_Mul64,
+ mk64lo32Uto64( mkexpr(rA) ),
+ mk64lo32Uto64( mkexpr(rB) ) ),
+ mkU8(32)) );
+ } else {
+ assign( rD, unop(Iop_64HIto32,
+ binop(Iop_MullU32,
+ mkexpr(rA), mkexpr(rB))) );
+ }
+ break;
+
+ case 0x0EB: // mullw (Multiply Low Word, PPC32 p491)
+ DIP("mullw%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ if (mode64) {
+ /* rD[hi32] are undefined: setting them to sign of lo32
+ - set_XER_OV() and set_CR0() depend on this */
+ IRExpr *a = unop(Iop_64to32, mkexpr(rA) );
+ IRExpr *b = unop(Iop_64to32, mkexpr(rB) );
+ assign( rD, binop(Iop_MullS32, a, b) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_MULLW,
+ mkexpr(rD),
+ unop(Iop_32Uto64, a), unop(Iop_32Uto64, b) );
+ }
+ } else {
+ assign( rD, unop(Iop_64to32,
+ binop(Iop_MullU32,
+ mkexpr(rA), mkexpr(rB))) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_MULLW,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ }
+ break;
+
+ case 0x068: // neg (Negate, PPC32 p493)
+ if (rB_addr != 0) {
+ vex_printf("dis_int_arith(ppc)(neg,rB_addr)\n");
+ return False;
+ }
+ DIP("neg%s%s r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr);
+ // rD = (~rA) + 1
+ assign( rD, binop( mkSzOp(ty, Iop_Add8),
+ unop( mkSzOp(ty, Iop_Not8), mkexpr(rA) ),
+ mkSzImm(ty, 1)) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_NEG,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ break;
+
+ case 0x028: // subf (Subtract From, PPC32 p537)
+ DIP("subf%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ // rD = rB - rA
+ assign( rD, binop( mkSzOp(ty, Iop_Sub8),
+ mkexpr(rB), mkexpr(rA)) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_SUBF,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ break;
+
+ case 0x008: // subfc (Subtract from Carrying, PPC32 p538)
+ DIP("subfc%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ // rD = rB - rA
+ assign( rD, binop( mkSzOp(ty, Iop_Sub8),
+ mkexpr(rB), mkexpr(rA)) );
+ set_XER_CA( ty, PPCG_FLAG_OP_SUBFC,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB),
+ mkSzImm(ty, 0)/*old xer.ca, which is ignored*/ );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_SUBFC,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ break;
+
+ case 0x088: {// subfe (Subtract from Extended, PPC32 p539)
+ IRTemp old_xer_ca = newTemp(ty);
+ DIP("subfe%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+      // rD = ~rA + rB + XER[CA]
+ assign( old_xer_ca, mkWidenFrom32(ty, getXER_CA32(), False) );
+ assign( rD, binop( mkSzOp(ty, Iop_Add8),
+ unop( mkSzOp(ty, Iop_Not8), mkexpr(rA)),
+ binop( mkSzOp(ty, Iop_Add8),
+ mkexpr(rB), mkexpr(old_xer_ca))) );
+ set_XER_CA( ty, PPCG_FLAG_OP_SUBFE,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB),
+ mkexpr(old_xer_ca) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_SUBFE,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ break;
+ }
+
+ case 0x0E8: { // subfme (Subtract from -1 Extended, PPC32 p541)
+ IRTemp old_xer_ca = newTemp(ty);
+ IRExpr *min_one;
+ if (rB_addr != 0) {
+ vex_printf("dis_int_arith(ppc)(subfme,rB_addr)\n");
+ return False;
+ }
+ DIP("subfme%s%s r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr);
+      // rD = ~rA + (-1) + XER[CA]
+ // => Just another form of subfe
+ assign( old_xer_ca, mkWidenFrom32(ty, getXER_CA32(), False) );
+ min_one = mkSzImm(ty, (Long)-1);
+ assign( rD, binop( mkSzOp(ty, Iop_Add8),
+ unop( mkSzOp(ty, Iop_Not8), mkexpr(rA)),
+ binop( mkSzOp(ty, Iop_Add8),
+ min_one, mkexpr(old_xer_ca))) );
+ set_XER_CA( ty, PPCG_FLAG_OP_SUBFE,
+ mkexpr(rD), mkexpr(rA), min_one,
+ mkexpr(old_xer_ca) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_SUBFE,
+ mkexpr(rD), mkexpr(rA), min_one );
+ }
+ break;
+ }
+
+ case 0x0C8: { // subfze (Subtract from Zero Extended, PPC32 p542)
+ IRTemp old_xer_ca = newTemp(ty);
+ if (rB_addr != 0) {
+ vex_printf("dis_int_arith(ppc)(subfze,rB_addr)\n");
+ return False;
+ }
+ DIP("subfze%s%s r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr);
+      // rD = ~rA + (0) + XER[CA]
+ // => Just another form of subfe
+ assign( old_xer_ca, mkWidenFrom32(ty, getXER_CA32(), False) );
+ assign( rD, binop( mkSzOp(ty, Iop_Add8),
+ unop( mkSzOp(ty, Iop_Not8),
+ mkexpr(rA)), mkexpr(old_xer_ca)) );
+ set_XER_CA( ty, PPCG_FLAG_OP_SUBFE,
+ mkexpr(rD), mkexpr(rA), mkSzImm(ty, 0),
+ mkexpr(old_xer_ca) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_SUBFE,
+ mkexpr(rD), mkexpr(rA), mkSzImm(ty, 0) );
+ }
+ break;
+ }
+
+
+ /* 64bit Arithmetic */
+ case 0x49: // mulhd (Multiply High DWord, PPC64 p539)
+ if (flag_OE != 0) {
+ vex_printf("dis_int_arith(ppc)(mulhd,flagOE)\n");
+ return False;
+ }
+ DIP("mulhd%s r%u,r%u,r%u\n", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ assign( rD, unop(Iop_128HIto64,
+ binop(Iop_MullS64,
+ mkexpr(rA), mkexpr(rB))) );
+         break;
+
+ case 0x9: // mulhdu (Multiply High DWord Unsigned, PPC64 p540)
+ if (flag_OE != 0) {
+ vex_printf("dis_int_arith(ppc)(mulhdu,flagOE)\n");
+ return False;
+ }
+ DIP("mulhdu%s r%u,r%u,r%u\n", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ assign( rD, unop(Iop_128HIto64,
+ binop(Iop_MullU64,
+ mkexpr(rA), mkexpr(rB))) );
+ break;
+
+ case 0xE9: // mulld (Multiply Low DWord, PPC64 p543)
+ DIP("mulld%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ assign( rD, binop(Iop_Mul64, mkexpr(rA), mkexpr(rB)) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_MULLW,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+ break;
+
+ case 0x1E9: // divd (Divide DWord, PPC64 p419)
+ DIP("divd%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ assign( rD, binop(Iop_DivS64, mkexpr(rA), mkexpr(rB)) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_DIVW,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+         /* Note:
+            if (0x8000_0000_0000_0000 / -1) or (x / 0)
+            => rD=undef, if(flag_rC) CR7=undef, if(flag_OE) XER_OV=1
+            => But _no_ exception raised. */
+         break;
+
+ case 0x1C9: // divdu (Divide DWord Unsigned, PPC64 p420)
+ DIP("divdu%s%s r%u,r%u,r%u\n",
+ flag_OE ? "o" : "", flag_rC ? ".":"",
+ rD_addr, rA_addr, rB_addr);
+ assign( rD, binop(Iop_DivU64, mkexpr(rA), mkexpr(rB)) );
+ if (flag_OE) {
+ set_XER_OV( ty, PPCG_FLAG_OP_DIVWU,
+ mkexpr(rD), mkexpr(rA), mkexpr(rB) );
+ }
+         /* Note: ditto comment divd, for (x / 0) */
+         break;
+
+ default:
+ vex_printf("dis_int_arith(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_int_arith(ppc)(opc1)\n");
+ return False;
+ }
+
+ putIReg( rD_addr, mkexpr(rD) );
+
+ if (do_rc && flag_rC) {
+ set_CR0( mkexpr(rD) );
+ }
+ return True;
+}
+
+
+
+/*
+ Integer Compare Instructions
+*/
+static Bool dis_int_cmp ( UInt theInstr )
+{
+ /* D-Form, X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar crfD = toUChar( IFIELD( theInstr, 23, 3 ) );
+ UChar b22 = toUChar( IFIELD( theInstr, 22, 1 ) );
+ UChar flag_L = toUChar( IFIELD( theInstr, 21, 1 ) );
+ UChar rA_addr = ifieldRegA(theInstr);
+ UInt uimm16 = ifieldUIMM16(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRExpr *a = getIReg(rA_addr);
+ IRExpr *b;
+
+ if (!mode64 && flag_L==1) { // L==1 invalid for 32 bit.
+ vex_printf("dis_int_cmp(ppc)(flag_L)\n");
+ return False;
+ }
+
+ if (b22 != 0) {
+ vex_printf("dis_int_cmp(ppc)(b22)\n");
+ return False;
+ }
+
+ switch (opc1) {
+ case 0x0B: // cmpi (Compare Immediate, PPC32 p368)
+ DIP("cmpi cr%u,%u,r%u,%d\n", crfD, flag_L, rA_addr,
+ (Int)extend_s_16to32(uimm16));
+ b = mkSzExtendS16( ty, uimm16 );
+ if (flag_L == 1) {
+ putCR321(crfD, unop(Iop_64to8, binop(Iop_CmpORD64S, a, b)));
+ } else {
+ a = mkNarrowTo32( ty, a );
+ b = mkNarrowTo32( ty, b );
+ putCR321(crfD, unop(Iop_32to8, binop(Iop_CmpORD32S, a, b)));
+ }
+ putCR0( crfD, getXER_SO() );
+ break;
+
+ case 0x0A: // cmpli (Compare Logical Immediate, PPC32 p370)
+ DIP("cmpli cr%u,%u,r%u,0x%x\n", crfD, flag_L, rA_addr, uimm16);
+ b = mkSzImm( ty, uimm16 );
+ if (flag_L == 1) {
+ putCR321(crfD, unop(Iop_64to8, binop(Iop_CmpORD64U, a, b)));
+ } else {
+ a = mkNarrowTo32( ty, a );
+ b = mkNarrowTo32( ty, b );
+ putCR321(crfD, unop(Iop_32to8, binop(Iop_CmpORD32U, a, b)));
+ }
+ putCR0( crfD, getXER_SO() );
+ break;
+
+ /* X Form */
+ case 0x1F:
+ if (b0 != 0) {
+ vex_printf("dis_int_cmp(ppc)(0x1F,b0)\n");
+ return False;
+ }
+ b = getIReg(rB_addr);
+
+ switch (opc2) {
+ case 0x000: // cmp (Compare, PPC32 p367)
+ DIP("cmp cr%u,%u,r%u,r%u\n", crfD, flag_L, rA_addr, rB_addr);
+ /* Comparing a reg with itself produces a result which
+ doesn't depend on the contents of the reg. Therefore
+ remove the false dependency, which has been known to cause
+ memcheck to produce false errors. */
+ if (rA_addr == rB_addr)
+ a = b = typeOfIRExpr(irsb->tyenv,a) == Ity_I64
+ ? mkU64(0) : mkU32(0);
+ if (flag_L == 1) {
+ putCR321(crfD, unop(Iop_64to8, binop(Iop_CmpORD64S, a, b)));
+ } else {
+ a = mkNarrowTo32( ty, a );
+ b = mkNarrowTo32( ty, b );
+ putCR321(crfD, unop(Iop_32to8,binop(Iop_CmpORD32S, a, b)));
+ }
+ putCR0( crfD, getXER_SO() );
+ break;
+
+ case 0x020: // cmpl (Compare Logical, PPC32 p369)
+ DIP("cmpl cr%u,%u,r%u,r%u\n", crfD, flag_L, rA_addr, rB_addr);
+ /* Comparing a reg with itself produces a result which
+ doesn't depend on the contents of the reg. Therefore
+ remove the false dependency, which has been known to cause
+ memcheck to produce false errors. */
+ if (rA_addr == rB_addr)
+ a = b = typeOfIRExpr(irsb->tyenv,a) == Ity_I64
+ ? mkU64(0) : mkU32(0);
+ if (flag_L == 1) {
+ putCR321(crfD, unop(Iop_64to8, binop(Iop_CmpORD64U, a, b)));
+ } else {
+ a = mkNarrowTo32( ty, a );
+ b = mkNarrowTo32( ty, b );
+ putCR321(crfD, unop(Iop_32to8, binop(Iop_CmpORD32U, a, b)));
+ }
+ putCR0( crfD, getXER_SO() );
+ break;
+
+ default:
+ vex_printf("dis_int_cmp(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_int_cmp(ppc)(opc1)\n");
+ return False;
+ }
+
+ return True;
+}
+
+
+/*
+ Integer Logical Instructions
+*/
+static Bool dis_int_logic ( UInt theInstr )
+{
+ /* D-Form, X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar rS_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UInt uimm16 = ifieldUIMM16(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar flag_rC = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp rS = newTemp(ty);
+ IRTemp rA = newTemp(ty);
+ IRTemp rB = newTemp(ty);
+ IRExpr* irx;
+ Bool do_rc = False;
+
+ assign( rS, getIReg(rS_addr) );
+ assign( rB, getIReg(rB_addr) );
+
+ switch (opc1) {
+ case 0x1C: // andi. (AND Immediate, PPC32 p358)
+ DIP("andi. r%u,r%u,0x%x\n", rA_addr, rS_addr, uimm16);
+ assign( rA, binop( mkSzOp(ty, Iop_And8), mkexpr(rS),
+ mkSzImm(ty, uimm16)) );
+ do_rc = True; // Always record to CR
+ flag_rC = 1;
+ break;
+
+ case 0x1D: // andis. (AND Immediate Shifted, PPC32 p359)
+ DIP("andis r%u,r%u,0x%x\n", rA_addr, rS_addr, uimm16);
+ assign( rA, binop( mkSzOp(ty, Iop_And8), mkexpr(rS),
+ mkSzImm(ty, uimm16 << 16)) );
+ do_rc = True; // Always record to CR
+ flag_rC = 1;
+ break;
+
+ case 0x18: // ori (OR Immediate, PPC32 p497)
+ DIP("ori r%u,r%u,0x%x\n", rA_addr, rS_addr, uimm16);
+ assign( rA, binop( mkSzOp(ty, Iop_Or8), mkexpr(rS),
+ mkSzImm(ty, uimm16)) );
+ break;
+
+ case 0x19: // oris (OR Immediate Shifted, PPC32 p498)
+ DIP("oris r%u,r%u,0x%x\n", rA_addr, rS_addr, uimm16);
+ assign( rA, binop( mkSzOp(ty, Iop_Or8), mkexpr(rS),
+ mkSzImm(ty, uimm16 << 16)) );
+ break;
+
+ case 0x1A: // xori (XOR Immediate, PPC32 p550)
+ DIP("xori r%u,r%u,0x%x\n", rA_addr, rS_addr, uimm16);
+ assign( rA, binop( mkSzOp(ty, Iop_Xor8), mkexpr(rS),
+ mkSzImm(ty, uimm16)) );
+ break;
+
+ case 0x1B: // xoris (XOR Immediate Shifted, PPC32 p551)
+ DIP("xoris r%u,r%u,0x%x\n", rA_addr, rS_addr, uimm16);
+ assign( rA, binop( mkSzOp(ty, Iop_Xor8), mkexpr(rS),
+ mkSzImm(ty, uimm16 << 16)) );
+ break;
+
+ /* X Form */
+ case 0x1F:
+ do_rc = True; // All below record to CR
+
+ switch (opc2) {
+ case 0x01C: // and (AND, PPC32 p356)
+ DIP("and%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ assign(rA, binop( mkSzOp(ty, Iop_And8),
+ mkexpr(rS), mkexpr(rB)));
+ break;
+
+ case 0x03C: // andc (AND with Complement, PPC32 p357)
+ DIP("andc%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ assign(rA, binop( mkSzOp(ty, Iop_And8), mkexpr(rS),
+ unop( mkSzOp(ty, Iop_Not8),
+ mkexpr(rB))));
+ break;
+
+ case 0x01A: { // cntlzw (Count Leading Zeros Word, PPC32 p371)
+ IRExpr* lo32;
+ if (rB_addr!=0) {
+ vex_printf("dis_int_logic(ppc)(cntlzw,rB_addr)\n");
+ return False;
+ }
+ DIP("cntlzw%s r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr);
+
+ // mode64: count in low word only
+ lo32 = mode64 ? unop(Iop_64to32, mkexpr(rS)) : mkexpr(rS);
+
+ // Iop_Clz32 undefined for arg==0, so deal with that case:
+ irx = binop(Iop_CmpNE32, lo32, mkU32(0));
+ assign(rA, mkWidenFrom32(ty,
+ IRExpr_Mux0X( unop(Iop_1Uto8, irx),
+ mkU32(32),
+ unop(Iop_Clz32, lo32)),
+ False));
+
+ // TODO: alternatively: assign(rA, verbose_Clz32(rS));
+ break;
+ }
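+
+      /* Note (editor's addition): IRExpr_Mux0X(cond, expr0, exprX)
+         yields expr0 when the condition byte is zero.  Here the
+         CmpNE32 is false exactly when lo32 == 0, so a zero input
+         produces the architected cntlzw result of 32. */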
+
+ case 0x11C: // eqv (Equivalent, PPC32 p396)
+ DIP("eqv%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ assign( rA, unop( mkSzOp(ty, Iop_Not8),
+ binop( mkSzOp(ty, Iop_Xor8),
+ mkexpr(rS), mkexpr(rB))) );
+ break;
+
+      case 0x3BA: // extsb (Extend Sign Byte, PPC32 p397)
+ if (rB_addr!=0) {
+ vex_printf("dis_int_logic(ppc)(extsb,rB_addr)\n");
+ return False;
+ }
+ DIP("extsb%s r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr);
+ if (mode64)
+ assign( rA, unop(Iop_8Sto64, unop(Iop_64to8, mkexpr(rS))) );
+ else
+ assign( rA, unop(Iop_8Sto32, unop(Iop_32to8, mkexpr(rS))) );
+ break;
+
+ case 0x39A: // extsh (Extend Sign Half Word, PPC32 p398)
+ if (rB_addr!=0) {
+ vex_printf("dis_int_logic(ppc)(extsh,rB_addr)\n");
+ return False;
+ }
+ DIP("extsh%s r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr);
+ if (mode64)
+ assign( rA, unop(Iop_16Sto64,
+ unop(Iop_64to16, mkexpr(rS))) );
+ else
+ assign( rA, unop(Iop_16Sto32,
+ unop(Iop_32to16, mkexpr(rS))) );
+ break;
+
+ case 0x1DC: // nand (NAND, PPC32 p492)
+ DIP("nand%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ assign( rA, unop( mkSzOp(ty, Iop_Not8),
+ binop( mkSzOp(ty, Iop_And8),
+ mkexpr(rS), mkexpr(rB))) );
+ break;
+
+ case 0x07C: // nor (NOR, PPC32 p494)
+ DIP("nor%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ assign( rA, unop( mkSzOp(ty, Iop_Not8),
+ binop( mkSzOp(ty, Iop_Or8),
+ mkexpr(rS), mkexpr(rB))) );
+ break;
+
+ case 0x1BC: // or (OR, PPC32 p495)
+ if ((!flag_rC) && rS_addr == rB_addr) {
+ DIP("mr r%u,r%u\n", rA_addr, rS_addr);
+ assign( rA, mkexpr(rS) );
+ } else {
+ DIP("or%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ assign( rA, binop( mkSzOp(ty, Iop_Or8),
+ mkexpr(rS), mkexpr(rB)) );
+ }
+ break;
+
+ case 0x19C: // orc (OR with Complement, PPC32 p496)
+ DIP("orc%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ assign( rA, binop( mkSzOp(ty, Iop_Or8), mkexpr(rS),
+ unop(mkSzOp(ty, Iop_Not8), mkexpr(rB))));
+ break;
+
+ case 0x13C: // xor (XOR, PPC32 p549)
+ DIP("xor%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ assign( rA, binop( mkSzOp(ty, Iop_Xor8),
+ mkexpr(rS), mkexpr(rB)) );
+ break;
+
+
+ /* 64bit Integer Logical Instructions */
+ case 0x3DA: // extsw (Extend Sign Word, PPC64 p430)
+ if (rB_addr!=0) {
+ vex_printf("dis_int_logic(ppc)(extsw,rB_addr)\n");
+ return False;
+ }
+ DIP("extsw%s r%u,r%u\n", flag_rC ? ".":"", rA_addr, rS_addr);
+ assign(rA, unop(Iop_32Sto64, unop(Iop_64to32, mkexpr(rS))));
+ break;
+
+ case 0x03A: // cntlzd (Count Leading Zeros DWord, PPC64 p401)
+ if (rB_addr!=0) {
+ vex_printf("dis_int_logic(ppc)(cntlzd,rB_addr)\n");
+ return False;
+ }
+ DIP("cntlzd%s r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr);
+ // Iop_Clz64 undefined for arg==0, so deal with that case:
+ irx = binop(Iop_CmpNE64, mkexpr(rS), mkU64(0));
+ assign(rA, IRExpr_Mux0X( unop(Iop_1Uto8, irx),
+ mkU64(64),
+ unop(Iop_Clz64, mkexpr(rS)) ));
+ // TODO: alternatively: assign(rA, verbose_Clz64(rS));
+ break;
+
+ case 0x1FC: // cmpb (Power6: compare bytes)
+ DIP("cmpb r%u,r%u,r%u\n", rA_addr, rS_addr, rB_addr);
+
+ if (mode64)
+ assign( rA, unop( Iop_V128to64,
+ binop( Iop_CmpEQ8x16,
+ binop( Iop_64HLtoV128, mkU64(0), mkexpr(rS) ),
+ binop( Iop_64HLtoV128, mkU64(0), mkexpr(rB) )
+ )) );
+ else
+ assign( rA, unop( Iop_V128to32,
+ binop( Iop_CmpEQ8x16,
+ unop( Iop_32UtoV128, mkexpr(rS) ),
+ unop( Iop_32UtoV128, mkexpr(rB) )
+ )) );
+ break;
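+
+         /* Note (editor's addition): cmpb sets each byte of rA to 0xFF
+            where the corresponding bytes of rS and rB are equal, and
+            to 0x00 where they differ; the byte-wise compare is done
+            with the 128-bit Iop_CmpEQ8x16 on zero-padded operands,
+            after which the relevant low lanes are extracted. */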
+
+ case 0x2DF: { // mftgpr (move floating-point to general purpose register)
+ IRTemp frB = newTemp(Ity_F64);
+ DIP("mftgpr r%u,fr%u\n", rS_addr, rB_addr);
+
+ assign( frB, getFReg(rB_addr)); // always F64
+ if (mode64)
+ assign( rA, unop( Iop_ReinterpF64asI64, mkexpr(frB)) );
+ else
+ assign( rA, unop( Iop_64to32, unop( Iop_ReinterpF64asI64, mkexpr(frB))) );
+
+ putIReg( rS_addr, mkexpr(rA));
+ return True;
+ }
+
+ case 0x25F: { // mffgpr (move floating-point from general purpose register)
+ IRTemp frA = newTemp(Ity_F64);
+ DIP("mffgpr fr%u,r%u\n", rS_addr, rB_addr);
+
+ if (mode64)
+ assign( frA, unop( Iop_ReinterpI64asF64, mkexpr(rB)) );
+ else
+ assign( frA, unop( Iop_ReinterpI64asF64, unop( Iop_32Uto64, mkexpr(rB))) );
+
+ putFReg( rS_addr, mkexpr(frA));
+ return True;
+ }
+
+ default:
+ vex_printf("dis_int_logic(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_int_logic(ppc)(opc1)\n");
+ return False;
+ }
+
+ putIReg( rA_addr, mkexpr(rA) );
+
+ if (do_rc && flag_rC) {
+ set_CR0( mkexpr(rA) );
+ }
+ return True;
+}
+
+/*
+ Integer Parity Instructions
+*/
+static Bool dis_int_parity ( UInt theInstr )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar rS_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+
+ IRTemp rS = newTemp(ty);
+ IRTemp rA = newTemp(ty);
+ IRTemp iTot1 = newTemp(Ity_I32);
+ IRTemp iTot2 = newTemp(Ity_I32);
+ IRTemp iTot3 = newTemp(Ity_I32);
+ IRTemp iTot4 = newTemp(Ity_I32);
+ IRTemp iTot5 = newTemp(Ity_I32);
+ IRTemp iTot6 = newTemp(Ity_I32);
+ IRTemp iTot7 = newTemp(Ity_I32);
+ IRTemp iTot8 = newTemp(Ity_I32);
+ IRTemp rS1 = newTemp(ty);
+ IRTemp rS2 = newTemp(ty);
+ IRTemp rS3 = newTemp(ty);
+ IRTemp rS4 = newTemp(ty);
+ IRTemp rS5 = newTemp(ty);
+ IRTemp rS6 = newTemp(ty);
+ IRTemp rS7 = newTemp(ty);
+ IRTemp iHi = newTemp(Ity_I32);
+ IRTemp iLo = newTemp(Ity_I32);
+ IROp to_bit = (mode64 ? Iop_64to1 : Iop_32to1);
+ IROp shr_op = (mode64 ? Iop_Shr64 : Iop_Shr32);
+
+ if (opc1 != 0x1f || rB_addr || b0) {
+ vex_printf("dis_int_parity(ppc)(0x1F,opc1:rB|b0)\n");
+ return False;
+ }
+
+ assign( rS, getIReg(rS_addr) );
+
+ switch (opc2) {
+ case 0xba: // prtyd (Parity Doubleword, ISA 2.05 p320)
+ DIP("prtyd r%u,r%u\n", rA_addr, rS_addr);
+ assign( iTot1, unop(Iop_1Uto32, unop(to_bit, mkexpr(rS))) );
+ assign( rS1, binop(shr_op, mkexpr(rS), mkU8(8)) );
+ assign( iTot2, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS1))),
+ mkexpr(iTot1)) );
+ assign( rS2, binop(shr_op, mkexpr(rS1), mkU8(8)) );
+ assign( iTot3, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS2))),
+ mkexpr(iTot2)) );
+ assign( rS3, binop(shr_op, mkexpr(rS2), mkU8(8)) );
+ assign( iTot4, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS3))),
+ mkexpr(iTot3)) );
+ if (mode64) {
+ assign( rS4, binop(shr_op, mkexpr(rS3), mkU8(8)) );
+ assign( iTot5, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS4))),
+ mkexpr(iTot4)) );
+ assign( rS5, binop(shr_op, mkexpr(rS4), mkU8(8)) );
+ assign( iTot6, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS5))),
+ mkexpr(iTot5)) );
+ assign( rS6, binop(shr_op, mkexpr(rS5), mkU8(8)) );
+ assign( iTot7, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS6))),
+ mkexpr(iTot6)) );
+ assign( rS7, binop(shr_op, mkexpr(rS6), mkU8(8)) );
+ assign( iTot8, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS7))),
+ mkexpr(iTot7)) );
+ assign( rA, unop(Iop_32Uto64,
+ binop(Iop_And32, mkexpr(iTot8), mkU32(1))) );
+ } else
+ assign( rA, mkexpr(iTot4) );
+
+ break;
+   case 0x9a: // prtyw (Parity Word, ISA 2.05 p320)
+      DIP("prtyw r%u,r%u\n", rA_addr, rS_addr);
+      assign( iTot1, unop(Iop_1Uto32, unop(to_bit, mkexpr(rS))) );
+ assign( rS1, binop(shr_op, mkexpr(rS), mkU8(8)) );
+ assign( iTot2, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS1))),
+ mkexpr(iTot1)) );
+ assign( rS2, binop(shr_op, mkexpr(rS1), mkU8(8)) );
+ assign( iTot3, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS2))),
+ mkexpr(iTot2)) );
+ assign( rS3, binop(shr_op, mkexpr(rS2), mkU8(8)) );
+ assign( iTot4, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS3))),
+ mkexpr(iTot3)) );
+ assign( iLo, unop(Iop_1Uto32, unop(Iop_32to1, mkexpr(iTot4) )) );
+
+ if (mode64) {
+ assign( rS4, binop(shr_op, mkexpr(rS3), mkU8(8)) );
+ assign( iTot5, unop(Iop_1Uto32, unop(to_bit, mkexpr(rS4))) );
+ assign( rS5, binop(shr_op, mkexpr(rS4), mkU8(8)) );
+ assign( iTot6, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS5))),
+ mkexpr(iTot5)) );
+ assign( rS6, binop(shr_op, mkexpr(rS5), mkU8(8)) );
+ assign( iTot7, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS6))),
+ mkexpr(iTot6)) );
+ assign( rS7, binop(shr_op, mkexpr(rS6), mkU8(8)));
+ assign( iTot8, binop(Iop_Add32,
+ unop(Iop_1Uto32, unop(to_bit, mkexpr(rS7))),
+ mkexpr(iTot7)) );
+         assign( iHi, binop(Iop_And32, mkU32(1), mkexpr(iTot8)) );
+ assign( rA, binop(Iop_32HLto64, mkexpr(iHi), mkexpr(iLo)) );
+ } else
+ assign( rA, binop(Iop_Or32, mkU32(0), mkexpr(iLo)) );
+ break;
+ default:
+ vex_printf("dis_int_parity(ppc)(opc2)\n");
+ return False;
+ }
+
+ putIReg( rA_addr, mkexpr(rA) );
+
+ return True;
+}
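+
+/* Note (editor's addition): both prtyd and prtyw above compute parity
+   by repeatedly shifting rS right by 8 and summing the least
+   significant bit of each byte; bit 0 of the running total is then
+   the desired parity bit. */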
+
+
+/*
+ Integer Rotate Instructions
+*/
+static Bool dis_int_rot ( UInt theInstr )
+{
+ /* M-Form, MDS-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar rS_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UChar sh_imm = rB_addr;
+ UChar MaskBeg = toUChar( IFIELD( theInstr, 6, 5 ) );
+ UChar MaskEnd = toUChar( IFIELD( theInstr, 1, 5 ) );
+ UChar msk_imm = toUChar( IFIELD( theInstr, 5, 6 ) );
+ UChar opc2 = toUChar( IFIELD( theInstr, 2, 3 ) );
+ UChar b1 = ifieldBIT1(theInstr);
+ UChar flag_rC = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp rS = newTemp(ty);
+ IRTemp rA = newTemp(ty);
+ IRTemp rB = newTemp(ty);
+ IRTemp rot = newTemp(ty);
+ IRExpr *r;
+ UInt mask32;
+ ULong mask64;
+
+ assign( rS, getIReg(rS_addr) );
+ assign( rB, getIReg(rB_addr) );
+
+ switch (opc1) {
+ case 0x14: {
+ // rlwimi (Rotate Left Word Imm then Mask Insert, PPC32 p500)
+ DIP("rlwimi%s r%u,r%u,%d,%d,%d\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, sh_imm, MaskBeg, MaskEnd);
+ if (mode64) {
+ // tmp32 = (ROTL(rS_Lo32, Imm)
+ // rA = ((tmp32 || tmp32) & mask64) | (rA & ~mask64)
+ mask64 = MASK64(31-MaskEnd, 31-MaskBeg);
+ r = ROTL( unop(Iop_64to32, mkexpr(rS) ), mkU8(sh_imm) );
+ r = unop(Iop_32Uto64, r);
+ assign( rot, binop(Iop_Or64, r,
+ binop(Iop_Shl64, r, mkU8(32))) );
+ assign( rA,
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(rot), mkU64(mask64)),
+ binop(Iop_And64, getIReg(rA_addr), mkU64(~mask64))) );
+ }
+ else {
+ // rA = (ROTL(rS, Imm) & mask) | (rA & ~mask);
+ mask32 = MASK32(31-MaskEnd, 31-MaskBeg);
+ r = ROTL(mkexpr(rS), mkU8(sh_imm));
+ assign( rA,
+ binop(Iop_Or32,
+ binop(Iop_And32, mkU32(mask32), r),
+ binop(Iop_And32, getIReg(rA_addr), mkU32(~mask32))) );
+ }
+ break;
+ }
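+
+   /* Worked example (editor's illustration): in 32-bit mode,
+      "rlwimi rA,rS,16,0,15" gives mask32 == 0xFFFF0000, so the low
+      halfword of rS is rotated up and inserted into the high halfword
+      of rA, while the low halfword of rA is preserved. */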
+
+ case 0x15: {
+ // rlwinm (Rotate Left Word Imm then AND with Mask, PPC32 p501)
+ vassert(MaskBeg < 32);
+ vassert(MaskEnd < 32);
+ vassert(sh_imm < 32);
+
+ if (mode64) {
+ IRTemp rTmp = newTemp(Ity_I64);
+ mask64 = MASK64(31-MaskEnd, 31-MaskBeg);
+ DIP("rlwinm%s r%u,r%u,%d,%d,%d\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, sh_imm, MaskBeg, MaskEnd);
+ // tmp32 = (ROTL(rS_Lo32, Imm)
+ // rA = ((tmp32 || tmp32) & mask64)
+ r = ROTL( unop(Iop_64to32, mkexpr(rS) ), mkU8(sh_imm) );
+ r = unop(Iop_32Uto64, r);
+ assign( rTmp, r );
+ r = NULL;
+ assign( rot, binop(Iop_Or64, mkexpr(rTmp),
+ binop(Iop_Shl64, mkexpr(rTmp), mkU8(32))) );
+ assign( rA, binop(Iop_And64, mkexpr(rot), mkU64(mask64)) );
+ }
+ else {
+ if (MaskBeg == 0 && sh_imm+MaskEnd == 31) {
+ /* Special-case the ,n,0,31-n form as that is just n-bit
+ shift left, PPC32 p501 */
+ DIP("slwi%s r%u,r%u,%d\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, sh_imm);
+ assign( rA, binop(Iop_Shl32, mkexpr(rS), mkU8(sh_imm)) );
+ }
+ else if (MaskEnd == 31 && sh_imm+MaskBeg == 32) {
+ /* Special-case the ,32-n,n,31 form as that is just n-bit
+ unsigned shift right, PPC32 p501 */
+ DIP("srwi%s r%u,r%u,%d\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, MaskBeg);
+ assign( rA, binop(Iop_Shr32, mkexpr(rS), mkU8(MaskBeg)) );
+ }
+ else {
+ /* General case. */
+ mask32 = MASK32(31-MaskEnd, 31-MaskBeg);
+ DIP("rlwinm%s r%u,r%u,%d,%d,%d\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, sh_imm, MaskBeg, MaskEnd);
+ // rA = ROTL(rS, Imm) & mask
+ assign( rA, binop(Iop_And32,
+ ROTL(mkexpr(rS), mkU8(sh_imm)),
+ mkU32(mask32)) );
+ }
+ }
+ break;
+ }
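+
+   /* Worked examples (editor's illustration): "rlwinm rA,rS,8,0,23"
+      satisfies MaskBeg == 0 && sh_imm+MaskEnd == 31 and so is decoded
+      as "slwi rA,rS,8"; likewise "rlwinm rA,rS,24,8,31" satisfies the
+      second condition and becomes "srwi rA,rS,8". */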
+
+ case 0x17: {
+      // rlwnm (Rotate Left Word then AND with Mask, PPC32 p503)
+ DIP("rlwnm%s r%u,r%u,r%u,%d,%d\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, rB_addr, MaskBeg, MaskEnd);
+ if (mode64) {
+ mask64 = MASK64(31-MaskEnd, 31-MaskBeg);
+ /* weird insn alert!
+ tmp32 = (ROTL(rS_Lo32, rB[0-4])
+ rA = ((tmp32 || tmp32) & mask64)
+ */
+ // note, ROTL does the masking, so we don't do it here
+ r = ROTL( unop(Iop_64to32, mkexpr(rS)),
+ unop(Iop_64to8, mkexpr(rB)) );
+ r = unop(Iop_32Uto64, r);
+ assign(rot, binop(Iop_Or64, r, binop(Iop_Shl64, r, mkU8(32))));
+ assign( rA, binop(Iop_And64, mkexpr(rot), mkU64(mask64)) );
+ } else {
+ mask32 = MASK32(31-MaskEnd, 31-MaskBeg);
+ // rA = ROTL(rS, rB[0-4]) & mask
+ // note, ROTL does the masking, so we don't do it here
+ assign( rA, binop(Iop_And32,
+ ROTL(mkexpr(rS),
+ unop(Iop_32to8, mkexpr(rB))),
+ mkU32(mask32)) );
+ }
+ break;
+ }
+
+ /* 64bit Integer Rotates */
+ case 0x1E: {
+ msk_imm = ((msk_imm & 1) << 5) | (msk_imm >> 1);
+ sh_imm |= b1 << 5;
+
+ vassert( msk_imm < 64 );
+ vassert( sh_imm < 64 );
+
+ switch (opc2) {
+ case 0x4: {
+ /* r = ROTL64( rS, rB_lo6) */
+ r = ROTL( mkexpr(rS), unop(Iop_64to8, mkexpr(rB)) );
+
+ if (b1 == 0) { // rldcl (Rotl DWord, Clear Left, PPC64 p555)
+ DIP("rldcl%s r%u,r%u,r%u,%u\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, rB_addr, msk_imm);
+ // note, ROTL does the masking, so we don't do it here
+ mask64 = MASK64(0, 63-msk_imm);
+ assign( rA, binop(Iop_And64, r, mkU64(mask64)) );
+ break;
+ } else { // rldcr (Rotl DWord, Clear Right, PPC64 p556)
+ DIP("rldcr%s r%u,r%u,r%u,%u\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, rB_addr, msk_imm);
+ mask64 = MASK64(63-msk_imm, 63);
+ assign( rA, binop(Iop_And64, r, mkU64(mask64)) );
+ break;
+ }
+         }
+ case 0x2: // rldic (Rotl DWord Imm, Clear, PPC64 p557)
+ DIP("rldic%s r%u,r%u,%u,%u\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, sh_imm, msk_imm);
+ r = ROTL(mkexpr(rS), mkU8(sh_imm));
+ mask64 = MASK64(sh_imm, 63-msk_imm);
+ assign( rA, binop(Iop_And64, r, mkU64(mask64)) );
+ break;
+ // later: deal with special case: (msk_imm==0) => SHL(sh_imm)
+ /*
+ Hmm... looks like this'll do the job more simply:
+ r = SHL(rS, sh_imm)
+ m = ~(1 << (63-msk_imm))
+ assign(rA, r & m);
+ */
+
+ case 0x0: // rldicl (Rotl DWord Imm, Clear Left, PPC64 p558)
+ if (mode64
+ && sh_imm + msk_imm == 64 && msk_imm >= 1 && msk_imm <= 63) {
+ /* special-case the ,64-n,n form as that is just
+ unsigned shift-right by n */
+ DIP("srdi%s r%u,r%u,%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, msk_imm);
+ assign( rA, binop(Iop_Shr64, mkexpr(rS), mkU8(msk_imm)) );
+ } else {
+ DIP("rldicl%s r%u,r%u,%u,%u\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, sh_imm, msk_imm);
+ r = ROTL(mkexpr(rS), mkU8(sh_imm));
+ mask64 = MASK64(0, 63-msk_imm);
+ assign( rA, binop(Iop_And64, r, mkU64(mask64)) );
+ }
+ break;
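+
+      /* Worked example (editor's illustration): "rldicl rA,rS,56,8"
+         satisfies sh_imm + msk_imm == 64 and so is decoded as
+         "srdi rA,rS,8"; the general path instead rotates and ANDs
+         with MASK64(0, 63-msk_imm). */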
+
+ case 0x1: // rldicr (Rotl DWord Imm, Clear Right, PPC64 p559)
+ if (mode64
+ && sh_imm + msk_imm == 63 && sh_imm >= 1 && sh_imm <= 63) {
+ /* special-case the ,n,63-n form as that is just
+ shift-left by n */
+ DIP("sldi%s r%u,r%u,%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, sh_imm);
+ assign( rA, binop(Iop_Shl64, mkexpr(rS), mkU8(sh_imm)) );
+ } else {
+ DIP("rldicr%s r%u,r%u,%u,%u\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, sh_imm, msk_imm);
+ r = ROTL(mkexpr(rS), mkU8(sh_imm));
+ mask64 = MASK64(63-msk_imm, 63);
+ assign( rA, binop(Iop_And64, r, mkU64(mask64)) );
+ }
+ break;
+
+ case 0x3: { // rldimi (Rotl DWord Imm, Mask Insert, PPC64 p560)
+ IRTemp rA_orig = newTemp(ty);
+ DIP("rldimi%s r%u,r%u,%u,%u\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, sh_imm, msk_imm);
+ r = ROTL(mkexpr(rS), mkU8(sh_imm));
+ mask64 = MASK64(sh_imm, 63-msk_imm);
+ assign( rA_orig, getIReg(rA_addr) );
+ assign( rA, binop(Iop_Or64,
+ binop(Iop_And64, mkU64(mask64), r),
+ binop(Iop_And64, mkU64(~mask64),
+ mkexpr(rA_orig))) );
+ break;
+ }
+ default:
+ vex_printf("dis_int_rot(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+ }
+
+ default:
+ vex_printf("dis_int_rot(ppc)(opc1)\n");
+ return False;
+ }
+
+ putIReg( rA_addr, mkexpr(rA) );
+
+ if (flag_rC) {
+ set_CR0( mkexpr(rA) );
+ }
+ return True;
+}
+
+
+/*
+ Integer Load Instructions
+*/
+static Bool dis_int_load ( UInt theInstr )
+{
+ /* D-Form, X-Form, DS-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar rD_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UInt uimm16 = ifieldUIMM16(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b1 = ifieldBIT1(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ Int simm16 = extend_s_16to32(uimm16);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp EA = newTemp(ty);
+ IRExpr* val;
+
+ switch (opc1) {
+ case 0x1F: // register offset
+ assign( EA, ea_rAor0_idxd( rA_addr, rB_addr ) );
+ break;
+ case 0x3A: // immediate offset: 64bit: ld/ldu/lwa: mask off
+ // lowest 2 bits of immediate before forming EA
+ simm16 = simm16 & 0xFFFFFFFC;
+ default: // immediate offset
+ assign( EA, ea_rAor0_simm( rA_addr, simm16 ) );
+ break;
+ }
+
+ switch (opc1) {
+ case 0x22: // lbz (Load B & Zero, PPC32 p433)
+ DIP("lbz r%u,%d(r%u)\n", rD_addr, (Int)simm16, rA_addr);
+ val = loadBE(Ity_I8, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom8(ty, val, False) );
+ break;
+
+ case 0x23: // lbzu (Load B & Zero, Update, PPC32 p434)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(lbzu,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("lbzu r%u,%d(r%u)\n", rD_addr, (Int)simm16, rA_addr);
+ val = loadBE(Ity_I8, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom8(ty, val, False) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x2A: // lha (Load HW Alg, PPC32 p445)
+ DIP("lha r%u,%d(r%u)\n", rD_addr, (Int)simm16, rA_addr);
+ val = loadBE(Ity_I16, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom16(ty, val, True) );
+ break;
+
+ case 0x2B: // lhau (Load HW Alg, Update, PPC32 p446)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(lhau,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("lhau r%u,%d(r%u)\n", rD_addr, (Int)simm16, rA_addr);
+ val = loadBE(Ity_I16, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom16(ty, val, True) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x28: // lhz (Load HW & Zero, PPC32 p450)
+ DIP("lhz r%u,%d(r%u)\n", rD_addr, (Int)simm16, rA_addr);
+ val = loadBE(Ity_I16, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom16(ty, val, False) );
+ break;
+
+   case 0x29: // lhzu (Load HW & Zero, Update, PPC32 p451)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(lhzu,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("lhzu r%u,%d(r%u)\n", rD_addr, (Int)simm16, rA_addr);
+ val = loadBE(Ity_I16, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom16(ty, val, False) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x20: // lwz (Load W & Zero, PPC32 p460)
+ DIP("lwz r%u,%d(r%u)\n", rD_addr, (Int)simm16, rA_addr);
+ val = loadBE(Ity_I32, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom32(ty, val, False) );
+ break;
+
+   case 0x21: // lwzu (Load W & Zero, Update, PPC32 p461)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(lwzu,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("lwzu r%u,%d(r%u)\n", rD_addr, (Int)simm16, rA_addr);
+ val = loadBE(Ity_I32, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom32(ty, val, False) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ /* X Form */
+ case 0x1F:
+ if (b0 != 0) {
+ vex_printf("dis_int_load(ppc)(Ox1F,b0)\n");
+ return False;
+ }
+
+ switch (opc2) {
+         case 0x077: // lbzux (Load B & Zero, Update Indexed, PPC32 p435)
+            if (rA_addr == 0 || rA_addr == rD_addr) {
+               vex_printf("dis_int_load(ppc)(lbzux,rA_addr|rD_addr)\n");
+               return False;
+            }
+            DIP("lbzux r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ val = loadBE(Ity_I8, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom8(ty, val, False) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x057: // lbzx (Load B & Zero, Indexed, PPC32 p436)
+ DIP("lbzx r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ val = loadBE(Ity_I8, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom8(ty, val, False) );
+ break;
+
+ case 0x177: // lhaux (Load HW Alg, Update Indexed, PPC32 p447)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(lhaux,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("lhaux r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ val = loadBE(Ity_I16, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom16(ty, val, True) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x157: // lhax (Load HW Alg, Indexed, PPC32 p448)
+ DIP("lhax r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ val = loadBE(Ity_I16, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom16(ty, val, True) );
+ break;
+
+ case 0x137: // lhzux (Load HW & Zero, Update Indexed, PPC32 p452)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(lhzux,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("lhzux r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ val = loadBE(Ity_I16, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom16(ty, val, False) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x117: // lhzx (Load HW & Zero, Indexed, PPC32 p453)
+ DIP("lhzx r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ val = loadBE(Ity_I16, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom16(ty, val, False) );
+ break;
+
+ case 0x037: // lwzux (Load W & Zero, Update Indexed, PPC32 p462)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(lwzux,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("lwzux r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ val = loadBE(Ity_I32, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom32(ty, val, False) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x017: // lwzx (Load W & Zero, Indexed, PPC32 p463)
+ DIP("lwzx r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ val = loadBE(Ity_I32, mkexpr(EA));
+ putIReg( rD_addr, mkWidenFrom32(ty, val, False) );
+ break;
+
+
+ /* 64bit Loads */
+ case 0x035: // ldux (Load DWord, Update Indexed, PPC64 p475)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(ldux,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("ldux r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ putIReg( rD_addr, loadBE(Ity_I64, mkexpr(EA)) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x015: // ldx (Load DWord, Indexed, PPC64 p476)
+ DIP("ldx r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ putIReg( rD_addr, loadBE(Ity_I64, mkexpr(EA)) );
+ break;
+
+ case 0x175: // lwaux (Load W Alg, Update Indexed, PPC64 p501)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(lwaux,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("lwaux r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ putIReg( rD_addr,
+ unop(Iop_32Sto64, loadBE(Ity_I32, mkexpr(EA))) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x155: // lwax (Load W Alg, Indexed, PPC64 p502)
+ DIP("lwax r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ putIReg( rD_addr,
+ unop(Iop_32Sto64, loadBE(Ity_I32, mkexpr(EA))) );
+ break;
+
+ default:
+ vex_printf("dis_int_load(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+
+ /* DS Form - 64bit Loads. In each case EA will have been formed
+ with the lowest 2 bits masked off the immediate offset. */
+ case 0x3A:
+ switch ((b1<<1) | b0) {
+ case 0x0: // ld (Load DWord, PPC64 p472)
+ DIP("ld r%u,%d(r%u)\n", rD_addr, simm16, rA_addr);
+ putIReg( rD_addr, loadBE(Ity_I64, mkexpr(EA)) );
+ break;
+
+ case 0x1: // ldu (Load DWord, Update, PPC64 p474)
+ if (rA_addr == 0 || rA_addr == rD_addr) {
+ vex_printf("dis_int_load(ppc)(ldu,rA_addr|rD_addr)\n");
+ return False;
+ }
+ DIP("ldu r%u,%d(r%u)\n", rD_addr, simm16, rA_addr);
+ putIReg( rD_addr, loadBE(Ity_I64, mkexpr(EA)) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x2: // lwa (Load Word Alg, PPC64 p499)
+ DIP("lwa r%u,%d(r%u)\n", rD_addr, simm16, rA_addr);
+ putIReg( rD_addr,
+ unop(Iop_32Sto64, loadBE(Ity_I32, mkexpr(EA))) );
+ break;
+
+ default:
+ vex_printf("dis_int_load(ppc)(0x3A, opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_int_load(ppc)(opc1)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+/*
+ Integer Store Instructions
+*/
+static Bool dis_int_store ( UInt theInstr, VexAbiInfo* vbi )
+{
+ /* D-Form, X-Form, DS-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UInt rS_addr = ifieldRegDS(theInstr);
+ UInt rA_addr = ifieldRegA(theInstr);
+ UInt uimm16 = ifieldUIMM16(theInstr);
+ UInt rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b1 = ifieldBIT1(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ Int simm16 = extend_s_16to32(uimm16);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp rS = newTemp(ty);
+ IRTemp rB = newTemp(ty);
+ IRTemp EA = newTemp(ty);
+
+ assign( rB, getIReg(rB_addr) );
+ assign( rS, getIReg(rS_addr) );
+
+ switch (opc1) {
+ case 0x1F: // register offset
+ assign( EA, ea_rAor0_idxd( rA_addr, rB_addr ) );
+ break;
+ case 0x3E: // immediate offset: 64bit: std/stdu: mask off
+ // lowest 2 bits of immediate before forming EA
+ simm16 = simm16 & 0xFFFFFFFC;
+ default: // immediate offset
+ assign( EA, ea_rAor0_simm( rA_addr, simm16 ) );
+ break;
+ }
+
+ switch (opc1) {
+ case 0x26: // stb (Store B, PPC32 p509)
+ DIP("stb r%u,%d(r%u)\n", rS_addr, simm16, rA_addr);
+ storeBE( mkexpr(EA), mkNarrowTo8(ty, mkexpr(rS)) );
+ break;
+
+ case 0x27: // stbu (Store B, Update, PPC32 p510)
+ if (rA_addr == 0 ) {
+ vex_printf("dis_int_store(ppc)(stbu,rA_addr)\n");
+ return False;
+ }
+ DIP("stbu r%u,%d(r%u)\n", rS_addr, simm16, rA_addr);
+ putIReg( rA_addr, mkexpr(EA) );
+ storeBE( mkexpr(EA), mkNarrowTo8(ty, mkexpr(rS)) );
+ break;
+
+ case 0x2C: // sth (Store HW, PPC32 p522)
+ DIP("sth r%u,%d(r%u)\n", rS_addr, simm16, rA_addr);
+ storeBE( mkexpr(EA), mkNarrowTo16(ty, mkexpr(rS)) );
+ break;
+
+ case 0x2D: // sthu (Store HW, Update, PPC32 p524)
+ if (rA_addr == 0) {
+ vex_printf("dis_int_store(ppc)(sthu,rA_addr)\n");
+ return False;
+ }
+ DIP("sthu r%u,%d(r%u)\n", rS_addr, simm16, rA_addr);
+ putIReg( rA_addr, mkexpr(EA) );
+ storeBE( mkexpr(EA), mkNarrowTo16(ty, mkexpr(rS)) );
+ break;
+
+ case 0x24: // stw (Store W, PPC32 p530)
+ DIP("stw r%u,%d(r%u)\n", rS_addr, simm16, rA_addr);
+ storeBE( mkexpr(EA), mkNarrowTo32(ty, mkexpr(rS)) );
+ break;
+
+ case 0x25: // stwu (Store W, Update, PPC32 p534)
+ if (rA_addr == 0) {
+ vex_printf("dis_int_store(ppc)(stwu,rA_addr)\n");
+ return False;
+ }
+ DIP("stwu r%u,%d(r%u)\n", rS_addr, simm16, rA_addr);
+ putIReg( rA_addr, mkexpr(EA) );
+ storeBE( mkexpr(EA), mkNarrowTo32(ty, mkexpr(rS)) );
+ break;
+
+ /* X Form : all these use EA_indexed */
+ case 0x1F:
+ if (b0 != 0) {
+ vex_printf("dis_int_store(ppc)(0x1F,b0)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x0F7: // stbux (Store B, Update Indexed, PPC32 p511)
+ if (rA_addr == 0) {
+ vex_printf("dis_int_store(ppc)(stbux,rA_addr)\n");
+ return False;
+ }
+ DIP("stbux r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ putIReg( rA_addr, mkexpr(EA) );
+ storeBE( mkexpr(EA), mkNarrowTo8(ty, mkexpr(rS)) );
+ break;
+
+ case 0x0D7: // stbx (Store B Indexed, PPC32 p512)
+ DIP("stbx r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ storeBE( mkexpr(EA), mkNarrowTo8(ty, mkexpr(rS)) );
+ break;
+
+ case 0x1B7: // sthux (Store HW, Update Indexed, PPC32 p525)
+ if (rA_addr == 0) {
+ vex_printf("dis_int_store(ppc)(sthux,rA_addr)\n");
+ return False;
+ }
+ DIP("sthux r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ putIReg( rA_addr, mkexpr(EA) );
+ storeBE( mkexpr(EA), mkNarrowTo16(ty, mkexpr(rS)) );
+ break;
+
+ case 0x197: // sthx (Store HW Indexed, PPC32 p526)
+ DIP("sthx r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ storeBE( mkexpr(EA), mkNarrowTo16(ty, mkexpr(rS)) );
+ break;
+
+ case 0x0B7: // stwux (Store W, Update Indexed, PPC32 p535)
+ if (rA_addr == 0) {
+ vex_printf("dis_int_store(ppc)(stwux,rA_addr)\n");
+ return False;
+ }
+ DIP("stwux r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ putIReg( rA_addr, mkexpr(EA) );
+ storeBE( mkexpr(EA), mkNarrowTo32(ty, mkexpr(rS)) );
+ break;
+
+ case 0x097: // stwx (Store W Indexed, PPC32 p536)
+ DIP("stwx r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ storeBE( mkexpr(EA), mkNarrowTo32(ty, mkexpr(rS)) );
+ break;
+
+
+ /* 64bit Stores */
+ case 0x0B5: // stdux (Store DWord, Update Indexed, PPC64 p584)
+ if (rA_addr == 0) {
+ vex_printf("dis_int_store(ppc)(stdux,rA_addr)\n");
+ return False;
+ }
+ DIP("stdux r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ putIReg( rA_addr, mkexpr(EA) );
+ storeBE( mkexpr(EA), mkexpr(rS) );
+ break;
+
+ case 0x095: // stdx (Store DWord Indexed, PPC64 p585)
+ DIP("stdx r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ storeBE( mkexpr(EA), mkexpr(rS) );
+ break;
+
+ default:
+ vex_printf("dis_int_store(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+
+ /* DS Form - 64bit Stores. In each case EA will have been formed
+ with the lowest 2 bits masked off the immediate offset. */
+ case 0x3E:
+ switch ((b1<<1) | b0) {
+ case 0x0: // std (Store DWord, PPC64 p580)
+ DIP("std r%u,%d(r%u)\n", rS_addr, simm16, rA_addr);
+ storeBE( mkexpr(EA), mkexpr(rS) );
+ break;
+
+ case 0x1: // stdu (Store DWord, Update, PPC64 p583)
+ DIP("stdu r%u,%d(r%u)\n", rS_addr, simm16, rA_addr);
+ putIReg( rA_addr, mkexpr(EA) );
+ storeBE( mkexpr(EA), mkexpr(rS) );
+ break;
+
+ default:
+ vex_printf("dis_int_load(ppc)(0x3A, opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_int_store(ppc)(opc1)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+/*
+ Integer Load/Store Multiple Instructions
+*/
+static Bool dis_int_ldst_mult ( UInt theInstr )
+{
+ /* D-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar rD_addr = ifieldRegDS(theInstr);
+ UChar rS_addr = rD_addr;
+ UChar rA_addr = ifieldRegA(theInstr);
+ UInt uimm16 = ifieldUIMM16(theInstr);
+
+ Int simm16 = extend_s_16to32(uimm16);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp EA = newTemp(ty);
+ UInt r = 0;
+ UInt ea_off = 0;
+ IRExpr* irx_addr;
+
+ assign( EA, ea_rAor0_simm( rA_addr, simm16 ) );
+
+ switch (opc1) {
+ case 0x2E: // lmw (Load Multiple Word, PPC32 p454)
+ if (rA_addr >= rD_addr) {
+ vex_printf("dis_int_ldst_mult(ppc)(lmw,rA_addr)\n");
+ return False;
+ }
+ DIP("lmw r%u,%d(r%u)\n", rD_addr, simm16, rA_addr);
+ for (r = rD_addr; r <= 31; r++) {
+ irx_addr = binop(Iop_Add32, mkexpr(EA), mkU32(ea_off));
+ putIReg( r, mkWidenFrom32(ty, loadBE(Ity_I32, irx_addr ),
+ False) );
+ ea_off += 4;
+ }
+ break;
+
+ case 0x2F: // stmw (Store Multiple Word, PPC32 p527)
+ DIP("stmw r%u,%d(r%u)\n", rS_addr, simm16, rA_addr);
+ for (r = rS_addr; r <= 31; r++) {
+ irx_addr = binop(Iop_Add32, mkexpr(EA), mkU32(ea_off));
+ storeBE( irx_addr, mkNarrowTo32(ty, getIReg(r)) );
+ ea_off += 4;
+ }
+ break;
+
+ default:
+ vex_printf("dis_int_ldst_mult(ppc)(opc1)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+/*
+ Integer Load/Store String Instructions
+*/
+static
+void generate_lsw_sequence ( IRTemp tNBytes, // # bytes, :: Ity_I32
+ IRTemp EA, // EA
+ Int rD, // first dst register
+ Int maxBytes ) // 32 or 128
+{
+ Int i, shift = 24;
+ IRExpr* e_nbytes = mkexpr(tNBytes);
+ IRExpr* e_EA = mkexpr(EA);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+
+ vassert(rD >= 0 && rD < 32);
+ rD--; if (rD < 0) rD = 31;
+
+ for (i = 0; i < maxBytes; i++) {
+ /* if (nBytes < (i+1)) goto NIA; */
+ stmt( IRStmt_Exit( binop(Iop_CmpLT32U, e_nbytes, mkU32(i+1)),
+ Ijk_Boring,
+ mkSzConst( ty, nextInsnAddr()) ));
+ /* when crossing into a new dest register, set it to zero. */
+ if ((i % 4) == 0) {
+ rD++; if (rD == 32) rD = 0;
+ putIReg(rD, mkSzImm(ty, 0));
+ shift = 24;
+ }
+ /* rD |= (8Uto32(*(EA+i))) << shift */
+ vassert(shift == 0 || shift == 8 || shift == 16 || shift == 24);
+ putIReg(
+ rD,
+ mkWidenFrom32(
+ ty,
+ binop(
+ Iop_Or32,
+ mkNarrowTo32(ty, getIReg(rD)),
+ binop(
+ Iop_Shl32,
+ unop(
+ Iop_8Uto32,
+ loadBE(Ity_I8,
+ binop(mkSzOp(ty,Iop_Add8), e_EA, mkSzImm(ty,i)))
+ ),
+ mkU8(toUChar(shift))
+ )
+ ),
+ /*Signed*/False
+ )
+ );
+ shift -= 8;
+ }
+}
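+
+/* Illustrative trace (editor's sketch): for a 5-byte lswi starting at
+   rD, iteration 0 zeroes rD and ORs bytes 0..3 into it at shifts
+   24,16,8,0 (big-endian order); iteration 4 zeroes the next register
+   and places byte 4 at shift 24.  Each iteration first side-exits to
+   the next instruction once i+1 exceeds nBytes, which also covers the
+   run-time byte counts used by lswx. */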
+
+static
+void generate_stsw_sequence ( IRTemp tNBytes, // # bytes, :: Ity_I32
+ IRTemp EA, // EA
+ Int rS, // first src register
+ Int maxBytes ) // 32 or 128
+{
+ Int i, shift = 24;
+ IRExpr* e_nbytes = mkexpr(tNBytes);
+ IRExpr* e_EA = mkexpr(EA);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+
+ vassert(rS >= 0 && rS < 32);
+ rS--; if (rS < 0) rS = 31;
+
+ for (i = 0; i < maxBytes; i++) {
+ /* if (nBytes < (i+1)) goto NIA; */
+ stmt( IRStmt_Exit( binop(Iop_CmpLT32U, e_nbytes, mkU32(i+1)),
+ Ijk_Boring,
+ mkSzConst( ty, nextInsnAddr() ) ));
+ /* check for crossing into a new src register. */
+ if ((i % 4) == 0) {
+ rS++; if (rS == 32) rS = 0;
+ shift = 24;
+ }
+ /* *(EA+i) = 32to8(rS >> shift) */
+ vassert(shift == 0 || shift == 8 || shift == 16 || shift == 24);
+ storeBE(
+ binop(mkSzOp(ty,Iop_Add8), e_EA, mkSzImm(ty,i)),
+ unop(Iop_32to8,
+ binop(Iop_Shr32,
+ mkNarrowTo32(ty, getIReg(rS)),
+ mkU8(toUChar(shift))))
+ );
+ shift -= 8;
+ }
+}
+
+static Bool dis_int_ldst_str ( UInt theInstr, /*OUT*/Bool* stopHere )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar rD_addr = ifieldRegDS(theInstr);
+ UChar rS_addr = rD_addr;
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UChar NumBytes = rB_addr;
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp t_EA = newTemp(ty);
+ IRTemp t_nbytes = IRTemp_INVALID;
+
+ *stopHere = False;
+
+ if (opc1 != 0x1F || b0 != 0) {
+ vex_printf("dis_int_ldst_str(ppc)(opc1)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x255: // lswi (Load String Word Immediate, PPC32 p455)
+ /* NB: does not reject the case where RA is in the range of
+ registers to be loaded. It should. */
+ DIP("lswi r%u,r%u,%d\n", rD_addr, rA_addr, NumBytes);
+ assign( t_EA, ea_rAor0(rA_addr) );
+ if (NumBytes == 8 && !mode64) {
+ /* Special case hack */
+ /* rD = Mem[EA]; (rD+1)%32 = Mem[EA+4] */
+ putIReg( rD_addr,
+ loadBE(Ity_I32, mkexpr(t_EA)) );
+ putIReg( (rD_addr+1) % 32,
+ loadBE(Ity_I32,
+ binop(Iop_Add32, mkexpr(t_EA), mkU32(4))) );
+ } else {
+ t_nbytes = newTemp(Ity_I32);
+ assign( t_nbytes, mkU32(NumBytes==0 ? 32 : NumBytes) );
+ generate_lsw_sequence( t_nbytes, t_EA, rD_addr, 32 );
+ *stopHere = True;
+ }
+ return True;
+
+ case 0x215: // lswx (Load String Word Indexed, PPC32 p456)
+ /* NB: does not reject the case where RA is in the range of
+         registers to be loaded. It should; but since that can only be
+         detected at run time, it's not easy to do so. */
+ if (rD_addr == rA_addr || rD_addr == rB_addr)
+ return False;
+ if (rD_addr == 0 && rA_addr == 0)
+ return False;
+ DIP("lswx r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ t_nbytes = newTemp(Ity_I32);
+ assign( t_EA, ea_rAor0_idxd(rA_addr,rB_addr) );
+ assign( t_nbytes, unop( Iop_8Uto32, getXER_BC() ) );
+ generate_lsw_sequence( t_nbytes, t_EA, rD_addr, 128 );
+ *stopHere = True;
+ return True;
+
+ case 0x2D5: // stswi (Store String Word Immediate, PPC32 p528)
+ DIP("stswi r%u,r%u,%d\n", rS_addr, rA_addr, NumBytes);
+ assign( t_EA, ea_rAor0(rA_addr) );
+ if (NumBytes == 8 && !mode64) {
+ /* Special case hack */
+ /* Mem[EA] = rD; Mem[EA+4] = (rD+1)%32 */
+ storeBE( mkexpr(t_EA),
+ getIReg(rD_addr) );
+ storeBE( binop(Iop_Add32, mkexpr(t_EA), mkU32(4)),
+ getIReg((rD_addr+1) % 32) );
+ } else {
+ t_nbytes = newTemp(Ity_I32);
+ assign( t_nbytes, mkU32(NumBytes==0 ? 32 : NumBytes) );
+ generate_stsw_sequence( t_nbytes, t_EA, rD_addr, 32 );
+ *stopHere = True;
+ }
+ return True;
+
+ case 0x295: // stswx (Store String Word Indexed, PPC32 p529)
+ DIP("stswx r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ t_nbytes = newTemp(Ity_I32);
+ assign( t_EA, ea_rAor0_idxd(rA_addr,rB_addr) );
+ assign( t_nbytes, unop( Iop_8Uto32, getXER_BC() ) );
+ generate_stsw_sequence( t_nbytes, t_EA, rS_addr, 128 );
+ *stopHere = True;
+ return True;
+
+ default:
+ vex_printf("dis_int_ldst_str(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+
+/* ------------------------------------------------------------------
+ Integer Branch Instructions
+ ------------------------------------------------------------------ */
+
+/*
+ Branch helper function
+ ok = BO[2] | ((CTR[0] != 0) ^ BO[1])
+ Returns an I32 which is 0x00000000 if the ctr condition failed
+ and 0xFFFFFFFF otherwise.
+*/
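+/* Example: BO = 0b00100 (BO[2] set) ignores CTR entirely;
+   BO = 0b00010 requests a branch when the decremented CTR is zero,
+   and BO = 0b00000 requests one when it is nonzero. */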
+static IRExpr* /* :: Ity_I32 */ branch_ctr_ok( UInt BO )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp ok = newTemp(Ity_I32);
+
+ if ((BO >> 2) & 1) { // independent of ctr
+ assign( ok, mkU32(0xFFFFFFFF) );
+ } else {
+ if ((BO >> 1) & 1) { // ctr == 0 ?
+ assign( ok, unop( Iop_1Sto32,
+ binop( mkSzOp(ty, Iop_CmpEQ8),
+ getGST( PPC_GST_CTR ),
+ mkSzImm(ty,0))) );
+ } else { // ctr != 0 ?
+ assign( ok, unop( Iop_1Sto32,
+ binop( mkSzOp(ty, Iop_CmpNE8),
+ getGST( PPC_GST_CTR ),
+ mkSzImm(ty,0))) );
+ }
+ }
+ return mkexpr(ok);
+}
+
+
+/*
+ Branch helper function cond_ok = BO[4] | (CR[BI] == BO[3])
+  Returns an I32 which is 0 if the condition failed, and
+  some arbitrary nonzero value otherwise. */
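+/* Example: BO = 0b10000 (BO[4] set) makes the branch unconditional
+   as far as CR is concerned; with BO[4] clear, BO[3] gives the value
+   (1 or 0) that CR[BI] must have for the branch to be taken. */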
+
+static IRExpr* /* :: Ity_I32 */ branch_cond_ok( UInt BO, UInt BI )
+{
+ Int where;
+ IRTemp res = newTemp(Ity_I32);
+ IRTemp cr_bi = newTemp(Ity_I32);
+
+ if ((BO >> 4) & 1) {
+ assign( res, mkU32(1) );
+ } else {
+      // ok = (CR[BI] == BO[3]).  Note: the following relies on
+      // getCRbit_anywhere returning a value which is either zero
+      // or has exactly 1 bit set.
+ assign( cr_bi, getCRbit_anywhere( BI, &where ) );
+
+ if ((BO >> 3) & 1) {
+ /* We can use cr_bi as-is. */
+ assign( res, mkexpr(cr_bi) );
+ } else {
+ /* We have to invert the sense of the information held in
+ cr_bi. For that we need to know which bit
+ getCRbit_anywhere regards as significant. */
+ assign( res, binop(Iop_Xor32, mkexpr(cr_bi),
+ mkU32(1<<where)) );
+ }
+ }
+ return mkexpr(res);
+}
+
+
+/*
+ Integer Branch Instructions
+*/
+static Bool dis_branch ( UInt theInstr,
+ VexAbiInfo* vbi,
+ /*OUT*/DisResult* dres,
+ Bool (*resteerOkFn)(void*,Addr64),
+ void* callback_opaque )
+{
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar BO = ifieldRegDS(theInstr);
+ UChar BI = ifieldRegA(theInstr);
+   UInt      BD_u16   = ifieldUIMM16(theInstr) & 0xFFFFFFFC; /* mask off AA, LK */
+ UChar b11to15 = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+   UInt      LI_u26   = ifieldUIMM26(theInstr) & 0xFFFFFFFC; /* mask off AA, LK */
+ UChar flag_AA = ifieldBIT1(theInstr);
+ UChar flag_LK = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ Addr64 tgt = 0;
+ Int BD = extend_s_16to32(BD_u16);
+ IRTemp do_branch = newTemp(Ity_I32);
+ IRTemp ctr_ok = newTemp(Ity_I32);
+ IRTemp cond_ok = newTemp(Ity_I32);
+ IRExpr* e_nia = mkSzImm(ty, nextInsnAddr());
+ IRConst* c_nia = mkSzConst(ty, nextInsnAddr());
+ IRTemp lr_old = newTemp(ty);
+
+ /* Hack to pass through code that just wants to read the PC */
+ if (theInstr == 0x429F0005) {
+ DIP("bcl 0x%x, 0x%x (a.k.a mr lr,cia+4)\n", BO, BI);
+ putGST( PPC_GST_LR, e_nia );
+ return True;
+ }
+
+ /* The default what-next. Individual cases can override it. */
+ dres->whatNext = Dis_StopHere;
+
+ switch (opc1) {
+ case 0x12: // b (Branch, PPC32 p360)
+ if (flag_AA) {
+ tgt = mkSzAddr( ty, extend_s_26to64(LI_u26) );
+ } else {
+ tgt = mkSzAddr( ty, guest_CIA_curr_instr +
+ (Long)extend_s_26to64(LI_u26) );
+ }
+ if (mode64) {
+ DIP("b%s%s 0x%llx\n",
+ flag_LK ? "l" : "", flag_AA ? "a" : "", tgt);
+ } else {
+ DIP("b%s%s 0x%x\n",
+ flag_LK ? "l" : "", flag_AA ? "a" : "", (Addr32)tgt);
+ }
+
+ if (flag_LK) {
+ putGST( PPC_GST_LR, e_nia );
+ if (vbi->guest_ppc_zap_RZ_at_bl
+ && vbi->guest_ppc_zap_RZ_at_bl( (ULong)tgt) ) {
+ IRTemp t_tgt = newTemp(ty);
+ assign(t_tgt, mode64 ? mkU64(tgt) : mkU32(tgt) );
+ make_redzone_AbiHint( vbi, t_tgt,
+ "branch-and-link (unconditional call)" );
+ }
+ }
+
+ if (resteerOkFn( callback_opaque, tgt )) {
+ dres->whatNext = Dis_ResteerU;
+ dres->continueAt = tgt;
+ } else {
+ irsb->jumpkind = flag_LK ? Ijk_Call : Ijk_Boring;
+ irsb->next = mkSzImm(ty, tgt);
+ }
+ break;
+
+ case 0x10: // bc (Branch Conditional, PPC32 p361)
+ DIP("bc%s%s 0x%x, 0x%x, 0x%x\n",
+ flag_LK ? "l" : "", flag_AA ? "a" : "", BO, BI, BD);
+
+ if (!(BO & 0x4)) {
+ putGST( PPC_GST_CTR,
+ binop(mkSzOp(ty, Iop_Sub8),
+ getGST( PPC_GST_CTR ), mkSzImm(ty, 1)) );
+ }
+
+      /* This is a bit subtle. ctr_ok is either all 0s or all 1s.
+         cond_ok is either zero or nonzero, since that's the cheapest
+         way to compute it. Anding them together gives a value which
+         is nonzero iff both conditions hold, and so that's what we
+         must test for in the IRStmt_Exit. */
+ assign( ctr_ok, branch_ctr_ok( BO ) );
+ assign( cond_ok, branch_cond_ok( BO, BI ) );
+ assign( do_branch,
+ binop(Iop_And32, mkexpr(cond_ok), mkexpr(ctr_ok)) );
+
+ if (flag_AA) {
+ tgt = mkSzAddr(ty, extend_s_16to64(BD_u16));
+ } else {
+ tgt = mkSzAddr(ty, guest_CIA_curr_instr +
+ (Long)extend_s_16to64(BD_u16));
+ }
+ if (flag_LK)
+ putGST( PPC_GST_LR, e_nia );
+
+ stmt( IRStmt_Exit(
+ binop(Iop_CmpNE32, mkexpr(do_branch), mkU32(0)),
+ flag_LK ? Ijk_Call : Ijk_Boring,
+ mkSzConst(ty, tgt) ) );
+
+ irsb->jumpkind = Ijk_Boring;
+ irsb->next = e_nia;
+ break;
+
+ case 0x13:
+ /* For bclr and bcctr, it appears that the lowest two bits of
+ b11to15 are a branch hint, and so we only need to ensure it's
+ of the form 000XX. */
+ if ((b11to15 & ~3) != 0) {
+ vex_printf("dis_int_branch(ppc)(0x13,b11to15)(%d)\n", (Int)b11to15);
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x210: // bcctr (Branch Cond. to Count Register, PPC32 p363)
+ if ((BO & 0x4) == 0) { // "decr and test CTR" option invalid
+ vex_printf("dis_int_branch(ppc)(bcctr,BO)\n");
+ return False;
+ }
+ DIP("bcctr%s 0x%x, 0x%x\n", flag_LK ? "l" : "", BO, BI);
+
+ assign( cond_ok, branch_cond_ok( BO, BI ) );
+
+ /* FIXME: this is confusing. lr_old holds the old value
+ of ctr, not lr :-) */
+ assign( lr_old, addr_align( getGST( PPC_GST_CTR ), 4 ));
+
+ if (flag_LK)
+ putGST( PPC_GST_LR, e_nia );
+
+ stmt( IRStmt_Exit(
+ binop(Iop_CmpEQ32, mkexpr(cond_ok), mkU32(0)),
+ Ijk_Boring,
+ c_nia ));
+
+ if (flag_LK && vbi->guest_ppc_zap_RZ_at_bl) {
+ make_redzone_AbiHint( vbi, lr_old,
+ "b-ctr-l (indirect call)" );
+ }
+
+ irsb->jumpkind = flag_LK ? Ijk_Call : Ijk_Boring;
+ irsb->next = mkexpr(lr_old);
+ break;
+
+ case 0x010: { // bclr (Branch Cond. to Link Register, PPC32 p365)
+ Bool vanilla_return = False;
+ if ((BO & 0x14 /* 1z1zz */) == 0x14 && flag_LK == 0) {
+ DIP("blr\n");
+ vanilla_return = True;
+ } else {
+ DIP("bclr%s 0x%x, 0x%x\n", flag_LK ? "l" : "", BO, BI);
+ }
+
+ if (!(BO & 0x4)) {
+ putGST( PPC_GST_CTR,
+ binop(mkSzOp(ty, Iop_Sub8),
+ getGST( PPC_GST_CTR ), mkSzImm(ty, 1)) );
+ }
+
+ /* See comments above for 'bc' about this */
+ assign( ctr_ok, branch_ctr_ok( BO ) );
+ assign( cond_ok, branch_cond_ok( BO, BI ) );
+ assign( do_branch,
+ binop(Iop_And32, mkexpr(cond_ok), mkexpr(ctr_ok)) );
+
+ assign( lr_old, addr_align( getGST( PPC_GST_LR ), 4 ));
+
+ if (flag_LK)
+ putGST( PPC_GST_LR, e_nia );
+
+ stmt( IRStmt_Exit(
+ binop(Iop_CmpEQ32, mkexpr(do_branch), mkU32(0)),
+ Ijk_Boring,
+ c_nia ));
+
+ if (vanilla_return && vbi->guest_ppc_zap_RZ_at_blr) {
+ make_redzone_AbiHint( vbi, lr_old,
+ "branch-to-lr (unconditional return)" );
+ }
+
+ /* blrl is pretty strange; it's like a return that sets the
+ return address of its caller to the insn following this
+ one. Mark it as a return. */
+ irsb->jumpkind = Ijk_Ret; /* was flag_LK ? Ijk_Call : Ijk_Ret; */
+ irsb->next = mkexpr(lr_old);
+ break;
+ }
+ default:
+ vex_printf("dis_int_branch(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_int_branch(ppc)(opc1)\n");
+ return False;
+ }
+
+ return True;
+}
+
+
+
+/*
+ Condition Register Logical Instructions
+*/
+static Bool dis_cond_logic ( UInt theInstr )
+{
+ /* XL-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar crbD_addr = ifieldRegDS(theInstr);
+ UChar crfD_addr = toUChar( IFIELD(theInstr, 23, 3) );
+ UChar crbA_addr = ifieldRegA(theInstr);
+ UChar crfS_addr = toUChar( IFIELD(theInstr, 18, 3) );
+ UChar crbB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ IRTemp crbD = newTemp(Ity_I32);
+ IRTemp crbA = newTemp(Ity_I32);
+ IRTemp crbB = newTemp(Ity_I32);
+
+ if (opc1 != 19 || b0 != 0) {
+ vex_printf("dis_cond_logic(ppc)(opc1)\n");
+ return False;
+ }
+
+ if (opc2 == 0) { // mcrf (Move Cond Reg Field, PPC32 p464)
+ if (((crbD_addr & 0x3) != 0) ||
+ ((crbA_addr & 0x3) != 0) || (crbB_addr != 0)) {
+ vex_printf("dis_cond_logic(ppc)(crbD|crbA|crbB != 0)\n");
+ return False;
+ }
+ DIP("mcrf cr%u,cr%u\n", crfD_addr, crfS_addr);
+ putCR0( crfD_addr, getCR0( crfS_addr) );
+ putCR321( crfD_addr, getCR321(crfS_addr) );
+ } else {
+ assign( crbA, getCRbit(crbA_addr) );
+ if (crbA_addr == crbB_addr)
+ crbB = crbA;
+ else
+ assign( crbB, getCRbit(crbB_addr) );
+
+ switch (opc2) {
+ case 0x101: // crand (Cond Reg AND, PPC32 p372)
+ DIP("crand crb%d,crb%d,crb%d\n", crbD_addr, crbA_addr, crbB_addr);
+ assign( crbD, binop(Iop_And32, mkexpr(crbA), mkexpr(crbB)) );
+ break;
+ case 0x081: // crandc (Cond Reg AND w. Complement, PPC32 p373)
+ DIP("crandc crb%d,crb%d,crb%d\n", crbD_addr, crbA_addr, crbB_addr);
+ assign( crbD, binop(Iop_And32,
+ mkexpr(crbA),
+ unop(Iop_Not32, mkexpr(crbB))) );
+ break;
+ case 0x121: // creqv (Cond Reg Equivalent, PPC32 p374)
+ DIP("creqv crb%d,crb%d,crb%d\n", crbD_addr, crbA_addr, crbB_addr);
+ assign( crbD, unop(Iop_Not32,
+ binop(Iop_Xor32, mkexpr(crbA), mkexpr(crbB))) );
+ break;
+ case 0x0E1: // crnand (Cond Reg NAND, PPC32 p375)
+ DIP("crnand crb%d,crb%d,crb%d\n", crbD_addr, crbA_addr, crbB_addr);
+ assign( crbD, unop(Iop_Not32,
+ binop(Iop_And32, mkexpr(crbA), mkexpr(crbB))) );
+ break;
+ case 0x021: // crnor (Cond Reg NOR, PPC32 p376)
+ DIP("crnor crb%d,crb%d,crb%d\n", crbD_addr, crbA_addr, crbB_addr);
+ assign( crbD, unop(Iop_Not32,
+ binop(Iop_Or32, mkexpr(crbA), mkexpr(crbB))) );
+ break;
+ case 0x1C1: // cror (Cond Reg OR, PPC32 p377)
+ DIP("cror crb%d,crb%d,crb%d\n", crbD_addr, crbA_addr, crbB_addr);
+ assign( crbD, binop(Iop_Or32, mkexpr(crbA), mkexpr(crbB)) );
+ break;
+ case 0x1A1: // crorc (Cond Reg OR w. Complement, PPC32 p378)
+ DIP("crorc crb%d,crb%d,crb%d\n", crbD_addr, crbA_addr, crbB_addr);
+ assign( crbD, binop(Iop_Or32,
+ mkexpr(crbA),
+ unop(Iop_Not32, mkexpr(crbB))) );
+ break;
+ case 0x0C1: // crxor (Cond Reg XOR, PPC32 p379)
+ DIP("crxor crb%d,crb%d,crb%d\n", crbD_addr, crbA_addr, crbB_addr);
+ assign( crbD, binop(Iop_Xor32, mkexpr(crbA), mkexpr(crbB)) );
+ break;
+ default:
+ vex_printf("dis_cond_logic(ppc)(opc2)\n");
+ return False;
+ }
+
+ putCRbit( crbD_addr, mkexpr(crbD) );
+ }
+ return True;
+}
+
+
+/*
+ Trap instructions
+*/
+
+/* Do the code generation for a trap. Returned Bool is true iff
+   this is an unconditional trap. If the two arg IRExpr*s are
+   Ity_I32s then the comparison is 32-bit. If they are Ity_I64s
+   then the comparison is 64-bit, and we must be disassembling
+   64-bit instructions. */
+static Bool do_trap ( UChar TO,
+ IRExpr* argL0, IRExpr* argR0, Addr64 cia )
+{
+ IRTemp argL, argR;
+ IRExpr *argLe, *argRe, *cond, *tmp;
+
+ Bool is32bit = typeOfIRExpr(irsb->tyenv, argL0 ) == Ity_I32;
+
+ IROp opAND = is32bit ? Iop_And32 : Iop_And64;
+ IROp opOR = is32bit ? Iop_Or32 : Iop_Or64;
+ IROp opCMPORDS = is32bit ? Iop_CmpORD32S : Iop_CmpORD64S;
+ IROp opCMPORDU = is32bit ? Iop_CmpORD32U : Iop_CmpORD64U;
+ IROp opCMPNE = is32bit ? Iop_CmpNE32 : Iop_CmpNE64;
+ IROp opCMPEQ = is32bit ? Iop_CmpEQ32 : Iop_CmpEQ64;
+ IRExpr* const0 = is32bit ? mkU32(0) : mkU64(0);
+ IRExpr* const2 = is32bit ? mkU32(2) : mkU64(2);
+ IRExpr* const4 = is32bit ? mkU32(4) : mkU64(4);
+ IRExpr* const8 = is32bit ? mkU32(8) : mkU64(8);
+
+ const UChar b11100 = 0x1C;
+ const UChar b00111 = 0x07;
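+   /* TO bit meanings: 16 = "L <s R", 8 = "L >s R", 4 = "L == R",
+      2 = "L <u R", 1 = "L >u R". If all three signed tests (b11100)
+      or the equality test plus both unsigned tests (b00111) are
+      requested, one of them must hold for any pair of values, so the
+      trap is unconditional. */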
+
+ if (is32bit) {
+ vassert( typeOfIRExpr(irsb->tyenv, argL0) == Ity_I32 );
+ vassert( typeOfIRExpr(irsb->tyenv, argR0) == Ity_I32 );
+ } else {
+ vassert( typeOfIRExpr(irsb->tyenv, argL0) == Ity_I64 );
+ vassert( typeOfIRExpr(irsb->tyenv, argR0) == Ity_I64 );
+ vassert( mode64 );
+ }
+
+ if ((TO & b11100) == b11100 || (TO & b00111) == b00111) {
+ /* Unconditional trap. Just do the exit without
+ testing the arguments. */
+ stmt( IRStmt_Exit(
+ binop(opCMPEQ, const0, const0),
+ Ijk_SigTRAP,
+ mode64 ? IRConst_U64(cia) : IRConst_U32((UInt)cia)
+ ));
+ return True; /* unconditional trap */
+ }
+
+ if (is32bit) {
+ argL = newTemp(Ity_I32);
+ argR = newTemp(Ity_I32);
+ } else {
+ argL = newTemp(Ity_I64);
+ argR = newTemp(Ity_I64);
+ }
+
+ assign( argL, argL0 );
+ assign( argR, argR0 );
+
+ argLe = mkexpr(argL);
+ argRe = mkexpr(argR);
+
+ cond = const0;
+ if (TO & 16) { // L <s R
+ tmp = binop(opAND, binop(opCMPORDS, argLe, argRe), const8);
+ cond = binop(opOR, tmp, cond);
+ }
+ if (TO & 8) { // L >s R
+ tmp = binop(opAND, binop(opCMPORDS, argLe, argRe), const4);
+ cond = binop(opOR, tmp, cond);
+ }
+ if (TO & 4) { // L == R
+ tmp = binop(opAND, binop(opCMPORDS, argLe, argRe), const2);
+ cond = binop(opOR, tmp, cond);
+ }
+ if (TO & 2) { // L <u R
+ tmp = binop(opAND, binop(opCMPORDU, argLe, argRe), const8);
+ cond = binop(opOR, tmp, cond);
+ }
+ if (TO & 1) { // L >u R
+ tmp = binop(opAND, binop(opCMPORDU, argLe, argRe), const4);
+ cond = binop(opOR, tmp, cond);
+ }
+ stmt( IRStmt_Exit(
+ binop(opCMPNE, cond, const0),
+ Ijk_SigTRAP,
+ mode64 ? IRConst_U64(cia) : IRConst_U32((UInt)cia)
+ ));
+ return False; /* not an unconditional trap */
+}
+
+static Bool dis_trapi ( UInt theInstr,
+ /*OUT*/DisResult* dres )
+{
+ /* D-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar TO = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UInt uimm16 = ifieldUIMM16(theInstr);
+ ULong simm16 = extend_s_16to64(uimm16);
+ Addr64 cia = guest_CIA_curr_instr;
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ Bool uncond = False;
+
+ switch (opc1) {
+ case 0x03: // twi (Trap Word Immediate, PPC32 p548)
+ uncond = do_trap( TO,
+ mode64 ? unop(Iop_64to32, getIReg(rA_addr))
+ : getIReg(rA_addr),
+ mkU32( (UInt)simm16 ),
+ cia );
+ if (TO == 4) {
+ DIP("tweqi r%u,%d\n", (UInt)rA_addr, (Int)simm16);
+ } else {
+ DIP("tw%di r%u,%d\n", (Int)TO, (UInt)rA_addr, (Int)simm16);
+ }
+ break;
+ case 0x02: // tdi
+ if (!mode64)
+ return False;
+ uncond = do_trap( TO, getIReg(rA_addr), mkU64( (ULong)simm16 ), cia );
+ if (TO == 4) {
+ DIP("tdeqi r%u,%d\n", (UInt)rA_addr, (Int)simm16);
+ } else {
+ DIP("td%di r%u,%d\n", (Int)TO, (UInt)rA_addr, (Int)simm16);
+ }
+ break;
+ default:
+ return False;
+ }
+
+ if (uncond) {
+ /* If the trap shows signs of being unconditional, don't
+ continue decoding past it. */
+ irsb->next = mkSzImm( ty, nextInsnAddr() );
+ irsb->jumpkind = Ijk_Boring;
+ dres->whatNext = Dis_StopHere;
+ }
+
+ return True;
+}
+
+static Bool dis_trap ( UInt theInstr,
+ /*OUT*/DisResult* dres )
+{
+ /* X-Form */
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar TO = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ Addr64 cia = guest_CIA_curr_instr;
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ Bool uncond = False;
+
+ if (ifieldBIT0(theInstr) != 0)
+ return False;
+
+ switch (opc2) {
+ case 0x004: // tw (Trap Word, PPC64 p540)
+ uncond = do_trap( TO,
+ mode64 ? unop(Iop_64to32, getIReg(rA_addr))
+ : getIReg(rA_addr),
+ mode64 ? unop(Iop_64to32, getIReg(rB_addr))
+ : getIReg(rB_addr),
+ cia );
+ if (TO == 4) {
+ DIP("tweq r%u,r%u\n", (UInt)rA_addr, (UInt)rB_addr);
+ } else {
+ DIP("tw%d r%u,r%u\n", (Int)TO, (UInt)rA_addr, (UInt)rB_addr);
+ }
+ break;
+ case 0x044: // td (Trap Doubleword, PPC64 p534)
+ if (!mode64)
+ return False;
+ uncond = do_trap( TO, getIReg(rA_addr), getIReg(rB_addr), cia );
+ if (TO == 4) {
+ DIP("tdeq r%u,r%u\n", (UInt)rA_addr, (UInt)rB_addr);
+ } else {
+ DIP("td%d r%u,r%u\n", (Int)TO, (UInt)rA_addr, (UInt)rB_addr);
+ }
+ break;
+ default:
+ return False;
+ }
+
+ if (uncond) {
+ /* If the trap shows signs of being unconditional, don't
+ continue decoding past it. */
+ irsb->next = mkSzImm( ty, nextInsnAddr() );
+ irsb->jumpkind = Ijk_Boring;
+ dres->whatNext = Dis_StopHere;
+ }
+
+ return True;
+}
+
+
+/*
+ System Linkage Instructions
+*/
+static Bool dis_syslink ( UInt theInstr,
+ VexAbiInfo* abiinfo, DisResult* dres )
+{
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+
+ if (theInstr != 0x44000002) {
+ vex_printf("dis_syslink(ppc)(theInstr)\n");
+ return False;
+ }
+
+ // sc (System Call, PPC32 p504)
+ DIP("sc\n");
+
+ /* Copy CIA into the IP_AT_SYSCALL pseudo-register, so that on AIX
+ Valgrind can back the guest up to this instruction if it needs
+ to restart the syscall. */
+ putGST( PPC_GST_IP_AT_SYSCALL, getGST( PPC_GST_CIA ) );
+
+ /* It's important that all ArchRegs carry their up-to-date value
+ at this point. So we declare an end-of-block here, which
+ forces any TempRegs caching ArchRegs to be flushed. */
+ irsb->next = abiinfo->guest_ppc_sc_continues_at_LR
+ ? getGST( PPC_GST_LR )
+ : mkSzImm( ty, nextInsnAddr() );
+ irsb->jumpkind = Ijk_Sys_syscall;
+
+ dres->whatNext = Dis_StopHere;
+ return True;
+}
+
+
+/*
+ Memory Synchronization Instructions
+
+ Note on Reservations:
+   We rely on the assumption that V will in fact only allow one thread
+   to run at a time. In effect, a thread can make a reservation, but we
+   don't check any stores it does. Instead, the reservation is cancelled
+   when the scheduler switches to another thread (run_thread_for_a_while()).
+*/
+static Bool dis_memsync ( UInt theInstr )
+{
+ /* X-Form, XL-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UInt b11to25 = IFIELD(theInstr, 11, 15);
+ UChar flag_L = ifieldRegDS(theInstr);
+ UInt b11to20 = IFIELD(theInstr, 11, 10);
+ UChar rD_addr = ifieldRegDS(theInstr);
+ UChar rS_addr = rD_addr;
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp EA = newTemp(ty);
+
+ assign( EA, ea_rAor0_idxd( rA_addr, rB_addr ) );
+
+ switch (opc1) {
+ /* XL-Form */
+ case 0x13: // isync (Instruction Synchronize, PPC32 p432)
+ if (opc2 != 0x096) {
+ vex_printf("dis_memsync(ppc)(0x13,opc2)\n");
+ return False;
+ }
+ if (b11to25 != 0 || b0 != 0) {
+ vex_printf("dis_memsync(ppc)(0x13,b11to25|b0)\n");
+ return False;
+ }
+ DIP("isync\n");
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ break;
+
+ /* X-Form */
+ case 0x1F:
+ switch (opc2) {
+ case 0x356: // eieio (Enforce In-Order Exec of I/O, PPC32 p394)
+ if (b11to25 != 0 || b0 != 0) {
+ vex_printf("dis_memsync(ppc)(eiei0,b11to25|b0)\n");
+ return False;
+ }
+ DIP("eieio\n");
+ /* Insert a memory fence, just to be on the safe side. */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ break;
+
+ case 0x014: { // lwarx (Load Word and Reserve Indexed, PPC32 p458)
+ IRTemp res;
+ /* According to the PowerPC ISA version 2.05, b0 (called EH
+ in the documentation) is merely a hint bit to the
+ hardware, I think as to whether or not contention is
+ likely. So we can just ignore it. */
+ DIP("lwarx r%u,r%u,r%u,EH=%u\n", rD_addr, rA_addr, rB_addr, (UInt)b0);
+
+ // trap if misaligned
+ gen_SIGBUS_if_misaligned( EA, 4 );
+
+ // and actually do the load
+ res = newTemp(Ity_I32);
+ stmt( IRStmt_LLSC(Iend_BE, res, mkexpr(EA), NULL/*this is a load*/) );
+
+ putIReg( rD_addr, mkWidenFrom32(ty, mkexpr(res), False) );
+ break;
+ }
+
+ case 0x096: {
+ // stwcx. (Store Word Conditional Indexed, PPC32 p532)
+ // Note this has to handle stwcx. in both 32- and 64-bit modes,
+ // so isn't quite as straightforward as it might otherwise be.
+ IRTemp rS = newTemp(Ity_I32);
+ IRTemp resSC;
+ if (b0 != 1) {
+ vex_printf("dis_memsync(ppc)(stwcx.,b0)\n");
+ return False;
+ }
+ DIP("stwcx. r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+
+ // trap if misaligned
+ gen_SIGBUS_if_misaligned( EA, 4 );
+
+ // Get the data to be stored, and narrow to 32 bits if necessary
+ assign( rS, mkNarrowTo32(ty, getIReg(rS_addr)) );
+
+ // Do the store, and get success/failure bit into resSC
+ resSC = newTemp(Ity_I1);
+ stmt( IRStmt_LLSC(Iend_BE, resSC, mkexpr(EA), mkexpr(rS)) );
+
+         // Set CR0[LT GT EQ SO] = 0b000 || XER[SO] on failure
+         // Set CR0[LT GT EQ SO] = 0b001 || XER[SO] on success
+ putCR321(0, binop(Iop_Shl8, unop(Iop_1Uto8, mkexpr(resSC)), mkU8(1)));
+ putCR0(0, getXER_SO());
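+         /* i.e. CR0.LT = CR0.GT = 0, CR0.EQ = the success bit, and
+            CR0.SO = XER[SO]. */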
+
+ /* Note:
+ If resaddr != lwarx_resaddr, CR0[EQ] is undefined, and
+ whether rS is stored is dependent on that value. */
+ /* So I guess we can just ignore this case? */
+ break;
+ }
+
+ case 0x256: // sync (Synchronize, PPC32 p543),
+ // also lwsync (L==1), ptesync (L==2)
+ /* http://sources.redhat.com/ml/binutils/2000-12/msg00311.html
+
+ The PowerPC architecture used in IBM chips has expanded
+ the sync instruction into two variants: lightweight sync
+ and heavyweight sync. The original sync instruction is
+ the new heavyweight sync and lightweight sync is a strict
+ subset of the heavyweight sync functionality. This allows
+ the programmer to specify a less expensive operation on
+ high-end systems when the full sync functionality is not
+ necessary.
+
+ The basic "sync" mnemonic now utilizes an operand. "sync"
+           without an operand now becomes an extended mnemonic for
+ heavyweight sync. Processors without the lwsync
+ instruction will not decode the L field and will perform a
+ heavyweight sync. Everything is backward compatible.
+
+ sync = sync 0
+ lwsync = sync 1
+ ptesync = sync 2 *** TODO - not implemented ***
+ */
+ if (b11to20 != 0 || b0 != 0) {
+ vex_printf("dis_memsync(ppc)(sync/lwsync,b11to20|b0)\n");
+ return False;
+ }
+ if (flag_L != 0/*sync*/ && flag_L != 1/*lwsync*/) {
+ vex_printf("dis_memsync(ppc)(sync/lwsync,flag_L)\n");
+ return False;
+ }
+ DIP("%ssync\n", flag_L == 1 ? "lw" : "");
+ /* Insert a memory fence. It's sometimes important that these
+ are carried through to the generated code. */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ break;
+
+ /* 64bit Memsync */
+ case 0x054: { // ldarx (Load DWord and Reserve Indexed, PPC64 p473)
+ IRTemp res;
+ /* According to the PowerPC ISA version 2.05, b0 (called EH
+ in the documentation) is merely a hint bit to the
+ hardware, I think as to whether or not contention is
+ likely. So we can just ignore it. */
+ if (!mode64)
+ return False;
+ DIP("ldarx r%u,r%u,r%u,EH=%u\n", rD_addr, rA_addr, rB_addr, (UInt)b0);
+
+ // trap if misaligned
+ gen_SIGBUS_if_misaligned( EA, 8 );
+
+ // and actually do the load
+ res = newTemp(Ity_I64);
+ stmt( IRStmt_LLSC(Iend_BE, res, mkexpr(EA), NULL/*this is a load*/) );
+
+ putIReg( rD_addr, mkexpr(res) );
+ break;
+ }
+
+      case 0x0D6: { // stdcx. (Store DWord Conditional Indexed, PPC64 p581)
+ // A marginally simplified version of the stwcx. case
+ IRTemp rS = newTemp(Ity_I64);
+ IRTemp resSC;
+ if (b0 != 1) {
+ vex_printf("dis_memsync(ppc)(stdcx.,b0)\n");
+ return False;
+ }
+ if (!mode64)
+ return False;
+ DIP("stdcx. r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+
+ // trap if misaligned
+ gen_SIGBUS_if_misaligned( EA, 8 );
+
+ // Get the data to be stored
+ assign( rS, getIReg(rS_addr) );
+
+ // Do the store, and get success/failure bit into resSC
+ resSC = newTemp(Ity_I1);
+ stmt( IRStmt_LLSC(Iend_BE, resSC, mkexpr(EA), mkexpr(rS)) );
+
+         // Set CR0[LT GT EQ SO] = 0b000 || XER[SO] on failure
+         // Set CR0[LT GT EQ SO] = 0b001 || XER[SO] on success
+ putCR321(0, binop(Iop_Shl8, unop(Iop_1Uto8, mkexpr(resSC)), mkU8(1)));
+ putCR0(0, getXER_SO());
+
+ /* Note:
+ If resaddr != lwarx_resaddr, CR0[EQ] is undefined, and
+ whether rS is stored is dependent on that value. */
+ /* So I guess we can just ignore this case? */
+ break;
+ }
+
+ default:
+ vex_printf("dis_memsync(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_memsync(ppc)(opc1)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+/*
+ Integer Shift Instructions
+*/
+static Bool dis_int_shift ( UInt theInstr )
+{
+ /* X-Form, XS-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar rS_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UChar sh_imm = rB_addr;
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b1 = ifieldBIT1(theInstr);
+ UChar flag_rC = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp rA = newTemp(ty);
+ IRTemp rS = newTemp(ty);
+ IRTemp rB = newTemp(ty);
+ IRTemp outofrange = newTemp(Ity_I8);
+ IRTemp rS_lo32 = newTemp(Ity_I32);
+ IRTemp rB_lo32 = newTemp(Ity_I32);
+ IRExpr* e_tmp;
+
+ assign( rS, getIReg(rS_addr) );
+ assign( rB, getIReg(rB_addr) );
+ assign( rS_lo32, mkNarrowTo32(ty, mkexpr(rS)) );
+ assign( rB_lo32, mkNarrowTo32(ty, mkexpr(rB)) );
+
+ if (opc1 == 0x1F) {
+ switch (opc2) {
+ case 0x018: { // slw (Shift Left Word, PPC32 p505)
+ DIP("slw%s r%u,r%u,r%u\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, rB_addr);
+ /* rA = rS << rB */
+ /* ppc32 semantics are:
+ slw(x,y) = (x << (y & 31)) -- primary result
+ & ~((y << 26) >>s 31) -- make result 0
+ for y in 32 .. 63
+ */
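+         /* Worked example: for y = 33 (0b100001), y << 26 has bit 31
+            set, so (y << 26) >>s 31 is all ones and the And mask
+            forces the result to 0; for y <= 31, bit 31 is clear and
+            the mask is all ones, so the shifted value passes through
+            unchanged. */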
+ e_tmp =
+ binop( Iop_And32,
+ binop( Iop_Shl32,
+ mkexpr(rS_lo32),
+ unop( Iop_32to8,
+ binop(Iop_And32,
+ mkexpr(rB_lo32), mkU32(31)))),
+ unop( Iop_Not32,
+ binop( Iop_Sar32,
+ binop(Iop_Shl32, mkexpr(rB_lo32), mkU8(26)),
+ mkU8(31))) );
+ assign( rA, mkWidenFrom32(ty, e_tmp, /* Signed */False) );
+ break;
+ }
+
+ case 0x318: { // sraw (Shift Right Alg Word, PPC32 p506)
+ IRTemp sh_amt = newTemp(Ity_I32);
+ DIP("sraw%s r%u,r%u,r%u\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, rB_addr);
+ /* JRS: my reading of the (poorly worded) PPC32 doc p506 is:
+ amt = rB & 63
+ rA = Sar32( rS, amt > 31 ? 31 : amt )
+ XER.CA = amt > 31 ? sign-of-rS : (computation as per srawi)
+ */
+ assign( sh_amt, binop(Iop_And32, mkU32(0x3F),
+ mkexpr(rB_lo32)) );
+ assign( outofrange,
+ unop( Iop_1Uto8,
+ binop(Iop_CmpLT32U, mkU32(31),
+ mkexpr(sh_amt)) ));
+ e_tmp = binop( Iop_Sar32,
+ mkexpr(rS_lo32),
+ unop( Iop_32to8,
+ IRExpr_Mux0X( mkexpr(outofrange),
+ mkexpr(sh_amt),
+ mkU32(31)) ) );
+ assign( rA, mkWidenFrom32(ty, e_tmp, /* Signed */True) );
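+         /* (IRExpr_Mux0X(b, e0, eX) yields e0 when b is zero and eX
+            otherwise, so the Sar32 shift amount above is clamped to
+            31 when out of range, as per the pseudo-code.) */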
+
+ set_XER_CA( ty, PPCG_FLAG_OP_SRAW,
+ mkexpr(rA),
+ mkWidenFrom32(ty, mkexpr(rS_lo32), True),
+ mkWidenFrom32(ty, mkexpr(sh_amt), True ),
+ mkWidenFrom32(ty, getXER_CA32(), True) );
+ break;
+ }
+
+ case 0x338: // srawi (Shift Right Alg Word Immediate, PPC32 p507)
+ DIP("srawi%s r%u,r%u,%d\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, sh_imm);
+ vassert(sh_imm < 32);
+ if (mode64) {
+ assign( rA, binop(Iop_Sar64,
+ binop(Iop_Shl64, getIReg(rS_addr),
+ mkU8(32)),
+ mkU8(32 + sh_imm)) );
+ } else {
+ assign( rA, binop(Iop_Sar32, mkexpr(rS_lo32),
+ mkU8(sh_imm)) );
+ }
+
+         set_XER_CA( ty, PPCG_FLAG_OP_SRAWI,
+                     mkexpr(rA),
+                     mkWidenFrom32(ty, mkexpr(rS_lo32), /* Signed */True),
+                     mkSzImm(ty, sh_imm),
+                     mkWidenFrom32(ty, getXER_CA32(), /* Signed */False) );
+ break;
+
+ case 0x218: // srw (Shift Right Word, PPC32 p508)
+ DIP("srw%s r%u,r%u,r%u\n", flag_rC ? ".":"",
+ rA_addr, rS_addr, rB_addr);
+ /* rA = rS >>u rB */
+ /* ppc32 semantics are:
+ srw(x,y) = (x >>u (y & 31)) -- primary result
+ & ~((y << 26) >>s 31) -- make result 0
+ for y in 32 .. 63
+ */
+ e_tmp =
+ binop(
+ Iop_And32,
+ binop( Iop_Shr32,
+ mkexpr(rS_lo32),
+ unop( Iop_32to8,
+ binop(Iop_And32, mkexpr(rB_lo32),
+ mkU32(31)))),
+ unop( Iop_Not32,
+ binop( Iop_Sar32,
+ binop(Iop_Shl32, mkexpr(rB_lo32),
+ mkU8(26)),
+ mkU8(31))));
+ assign( rA, mkWidenFrom32(ty, e_tmp, /* Signed */False) );
+ break;
+
+
+ /* 64bit Shifts */
+ case 0x01B: // sld (Shift Left DWord, PPC64 p568)
+ DIP("sld%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ /* rA = rS << rB */
+ /* ppc64 semantics are:
+            sld(x,y) = (x << (y & 63))         -- primary result
+                       & ~((y << 57) >>s 63)   -- make result 0
+                       for y in 64 .. 127
+ */
+ assign( rA,
+ binop(
+ Iop_And64,
+ binop( Iop_Shl64,
+ mkexpr(rS),
+ unop( Iop_64to8,
+ binop(Iop_And64, mkexpr(rB), mkU64(63)))),
+ unop( Iop_Not64,
+ binop( Iop_Sar64,
+ binop(Iop_Shl64, mkexpr(rB), mkU8(57)),
+ mkU8(63)))) );
+ break;
+
+ case 0x31A: { // srad (Shift Right Alg DWord, PPC64 p570)
+ IRTemp sh_amt = newTemp(Ity_I64);
+ DIP("srad%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ /* amt = rB & 127
+ rA = Sar64( rS, amt > 63 ? 63 : amt )
+ XER.CA = amt > 63 ? sign-of-rS : (computation as per srawi)
+ */
+ assign( sh_amt, binop(Iop_And64, mkU64(0x7F), mkexpr(rB)) );
+ assign( outofrange,
+ unop( Iop_1Uto8,
+ binop(Iop_CmpLT64U, mkU64(63),
+ mkexpr(sh_amt)) ));
+ assign( rA,
+ binop( Iop_Sar64,
+ mkexpr(rS),
+ unop( Iop_64to8,
+ IRExpr_Mux0X( mkexpr(outofrange),
+ mkexpr(sh_amt),
+ mkU64(63)) ))
+ );
+         set_XER_CA( ty, PPCG_FLAG_OP_SRAD,
+                     mkexpr(rA), mkexpr(rS), mkexpr(sh_amt),
+                     mkWidenFrom32(ty, getXER_CA32(), /* Signed */False) );
+ break;
+ }
+
+ case 0x33A: case 0x33B: // sradi (Shr Alg DWord Imm, PPC64 p571)
+ sh_imm |= b1<<5;
+ vassert(sh_imm < 64);
+ DIP("sradi%s r%u,r%u,%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, sh_imm);
+ assign( rA, binop(Iop_Sar64, getIReg(rS_addr), mkU8(sh_imm)) );
+
+         set_XER_CA( ty, PPCG_FLAG_OP_SRADI,
+                     mkexpr(rA),
+                     getIReg(rS_addr),
+                     mkU64(sh_imm),
+                     mkWidenFrom32(ty, getXER_CA32(), /* Signed */False) );
+ break;
+
+ case 0x21B: // srd (Shift Right DWord, PPC64 p574)
+ DIP("srd%s r%u,r%u,r%u\n",
+ flag_rC ? ".":"", rA_addr, rS_addr, rB_addr);
+ /* rA = rS >>u rB */
+ /* ppc semantics are:
+            srd(x,y) = (x >>u (y & 63))        -- primary result
+ & ~((y << 57) >>s 63) -- make result 0
+ for y in 64 .. 127
+ */
+ assign( rA,
+ binop(
+ Iop_And64,
+ binop( Iop_Shr64,
+ mkexpr(rS),
+ unop( Iop_64to8,
+ binop(Iop_And64, mkexpr(rB), mkU64(63)))),
+ unop( Iop_Not64,
+ binop( Iop_Sar64,
+ binop(Iop_Shl64, mkexpr(rB), mkU8(57)),
+ mkU8(63)))) );
+ break;
+
+ default:
+ vex_printf("dis_int_shift(ppc)(opc2)\n");
+ return False;
+ }
+ } else {
+ vex_printf("dis_int_shift(ppc)(opc1)\n");
+ return False;
+ }
+
+ putIReg( rA_addr, mkexpr(rA) );
+
+ if (flag_rC) {
+ set_CR0( mkexpr(rA) );
+ }
+ return True;
+}
+
+
+
+/*
+ Integer Load/Store Reverse Instructions
+*/
+/* Generates code to swap the byte order in an Ity_I32. */
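+/* (e.g. 0x11223344 becomes 0x44332211.) */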
+static IRExpr* /* :: Ity_I32 */ gen_byterev32 ( IRTemp t )
+{
+ vassert(typeOfIRTemp(irsb->tyenv, t) == Ity_I32);
+ return
+ binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(t), mkU8(24)),
+ binop(Iop_Or32,
+ binop(Iop_And32, binop(Iop_Shl32, mkexpr(t), mkU8(8)),
+ mkU32(0x00FF0000)),
+ binop(Iop_Or32,
+ binop(Iop_And32, binop(Iop_Shr32, mkexpr(t), mkU8(8)),
+ mkU32(0x0000FF00)),
+ binop(Iop_And32, binop(Iop_Shr32, mkexpr(t), mkU8(24)),
+ mkU32(0x000000FF) )
+ )));
+}
+
+/* Generates code to swap the byte order in the lower half of an Ity_I32,
+ and zeroes the upper half. */
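+/* (e.g. 0x55661122 becomes 0x00002211.) */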
+static IRExpr* /* :: Ity_I32 */ gen_byterev16 ( IRTemp t )
+{
+ vassert(typeOfIRTemp(irsb->tyenv, t) == Ity_I32);
+ return
+ binop(Iop_Or32,
+ binop(Iop_And32, binop(Iop_Shl32, mkexpr(t), mkU8(8)),
+ mkU32(0x0000FF00)),
+ binop(Iop_And32, binop(Iop_Shr32, mkexpr(t), mkU8(8)),
+ mkU32(0x000000FF))
+ );
+}
+
+static Bool dis_int_ldst_rev ( UInt theInstr )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar rD_addr = ifieldRegDS(theInstr);
+ UChar rS_addr = rD_addr;
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp EA = newTemp(ty);
+ IRTemp w1 = newTemp(Ity_I32);
+ IRTemp w2 = newTemp(Ity_I32);
+
+ if (opc1 != 0x1F || b0 != 0) {
+ vex_printf("dis_int_ldst_rev(ppc)(opc1|b0)\n");
+ return False;
+ }
+
+ assign( EA, ea_rAor0_idxd( rA_addr, rB_addr ) );
+
+ switch (opc2) {
+
+ case 0x316: // lhbrx (Load Halfword Byte-Reverse Indexed, PPC32 p449)
+ DIP("lhbrx r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ assign( w1, unop(Iop_16Uto32, loadBE(Ity_I16, mkexpr(EA))) );
+ assign( w2, gen_byterev16(w1) );
+ putIReg( rD_addr, mkWidenFrom32(ty, mkexpr(w2),
+ /* Signed */False) );
+ break;
+
+ case 0x216: // lwbrx (Load Word Byte-Reverse Indexed, PPC32 p459)
+ DIP("lwbrx r%u,r%u,r%u\n", rD_addr, rA_addr, rB_addr);
+ assign( w1, loadBE(Ity_I32, mkexpr(EA)) );
+ assign( w2, gen_byterev32(w1) );
+ putIReg( rD_addr, mkWidenFrom32(ty, mkexpr(w2),
+ /* Signed */False) );
+ break;
+
+ case 0x396: // sthbrx (Store Half Word Byte-Reverse Indexed, PPC32 p523)
+ DIP("sthbrx r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ assign( w1, mkNarrowTo32(ty, getIReg(rS_addr)) );
+ storeBE( mkexpr(EA), unop(Iop_32to16, gen_byterev16(w1)) );
+ break;
+
+ case 0x296: // stwbrx (Store Word Byte-Reverse Indxd, PPC32 p531)
+ DIP("stwbrx r%u,r%u,r%u\n", rS_addr, rA_addr, rB_addr);
+ assign( w1, mkNarrowTo32(ty, getIReg(rS_addr)) );
+ storeBE( mkexpr(EA), gen_byterev32(w1) );
+ break;
+
+ default:
+ vex_printf("dis_int_ldst_rev(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+/*
+ Processor Control Instructions
+*/
+static Bool dis_proc_ctl ( VexAbiInfo* vbi, UInt theInstr )
+{
+ UChar opc1 = ifieldOPC(theInstr);
+
+ /* X-Form */
+ UChar crfD = toUChar( IFIELD( theInstr, 23, 3 ) );
+ UChar b21to22 = toUChar( IFIELD( theInstr, 21, 2 ) );
+ UChar rD_addr = ifieldRegDS(theInstr);
+ UInt b11to20 = IFIELD( theInstr, 11, 10 );
+
+ /* XFX-Form */
+ UChar rS_addr = rD_addr;
+ UInt SPR = b11to20;
+ UInt TBR = b11to20;
+ UChar b20 = toUChar( IFIELD( theInstr, 20, 1 ) );
+ UInt CRM = IFIELD( theInstr, 12, 8 );
+ UChar b11 = toUChar( IFIELD( theInstr, 11, 1 ) );
+
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp rS = newTemp(ty);
+ assign( rS, getIReg(rS_addr) );
+
+ /* Reorder SPR field as per PPC32 p470 */
+ SPR = ((SPR & 0x1F) << 5) | ((SPR >> 5) & 0x1F);
+ /* Reorder TBR field as per PPC32 p475 */
+ TBR = ((TBR & 31) << 5) | ((TBR >> 5) & 31);
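+   /* e.g. a raw SPR field of 0x100 has its two 5-bit halves swapped
+      to give SPR 8, the LR (handled as case 0x8 below). */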
+
+ if (opc1 != 0x1F || b0 != 0) {
+ vex_printf("dis_proc_ctl(ppc)(opc1|b0)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ /* X-Form */
+ case 0x200: { // mcrxr (Move to Cond Register from XER, PPC32 p466)
+ if (b21to22 != 0 || b11to20 != 0) {
+ vex_printf("dis_proc_ctl(ppc)(mcrxr,b21to22|b11to20)\n");
+ return False;
+ }
+ DIP("mcrxr crf%d\n", crfD);
+ /* Move XER[0-3] (the top 4 bits of XER) to CR[crfD] */
+ putGST_field( PPC_GST_CR,
+ getGST_field( PPC_GST_XER, 7 ),
+ crfD );
+
+ // Clear XER[0-3]
+ putXER_SO( mkU8(0) );
+ putXER_OV( mkU8(0) );
+ putXER_CA( mkU8(0) );
+ break;
+ }
+
+ case 0x013:
+ // b11to20==0: mfcr (Move from Cond Register, PPC32 p467)
+ // b20==1 & b11==0: mfocrf (Move from One CR Field)
+      // However, it seems that the 'mfcr' behaviour is an acceptable
+      // implementation of mfocrf (from the 2.02 arch spec)
+ if (b11to20 == 0) {
+ DIP("mfcr r%u\n", rD_addr);
+ putIReg( rD_addr, mkWidenFrom32(ty, getGST( PPC_GST_CR ),
+ /* Signed */False) );
+ break;
+ }
+ if (b20 == 1 && b11 == 0) {
+ DIP("mfocrf r%u,%u\n", rD_addr, CRM);
+ putIReg( rD_addr, mkWidenFrom32(ty, getGST( PPC_GST_CR ),
+ /* Signed */False) );
+ break;
+ }
+ /* not decodable */
+ return False;
+
+ /* XFX-Form */
+ case 0x153: // mfspr (Move from Special-Purpose Register, PPC32 p470)
+
+ switch (SPR) { // Choose a register...
+ case 0x1:
+ DIP("mfxer r%u\n", rD_addr);
+ putIReg( rD_addr, mkWidenFrom32(ty, getGST( PPC_GST_XER ),
+ /* Signed */False) );
+ break;
+ case 0x8:
+ DIP("mflr r%u\n", rD_addr);
+ putIReg( rD_addr, getGST( PPC_GST_LR ) );
+ break;
+ case 0x9:
+ DIP("mfctr r%u\n", rD_addr);
+ putIReg( rD_addr, getGST( PPC_GST_CTR ) );
+ break;
+ case 0x100:
+ DIP("mfvrsave r%u\n", rD_addr);
+ putIReg( rD_addr, mkWidenFrom32(ty, getGST( PPC_GST_VRSAVE ),
+ /* Signed */False) );
+ break;
+
+ case 0x103:
+ DIP("mfspr r%u, SPRG3(readonly)\n", rD_addr);
+ putIReg( rD_addr, getGST( PPC_GST_SPRG3_RO ) );
+ break;
+
+ /* Even a lowly PPC7400 can run the associated helper, so no
+ obvious need for feature testing at this point. */
+ case 268 /* 0x10C */:
+ case 269 /* 0x10D */: {
+ UInt arg = SPR==268 ? 0 : 1;
+ IRTemp val = newTemp(Ity_I32);
+ IRExpr** args = mkIRExprVec_1( mkU32(arg) );
+ IRDirty* d = unsafeIRDirty_1_N(
+ val,
+ 0/*regparms*/,
+ "ppc32g_dirtyhelper_MFSPR_268_269",
+ fnptr_to_fnentry
+ (vbi, &ppc32g_dirtyhelper_MFSPR_268_269),
+ args
+ );
+ /* execute the dirty call, dumping the result in val. */
+ stmt( IRStmt_Dirty(d) );
+ putIReg( rD_addr,
+ mkWidenFrom32(ty, mkexpr(val), False/*unsigned*/) );
+ DIP("mfspr r%u,%u", rD_addr, (UInt)SPR);
+ break;
+ }
+
+ /* Again, runs natively on PPC7400 (7447, really). Not
+ bothering with a feature test. */
+ case 287: /* 0x11F */ {
+ IRTemp val = newTemp(Ity_I32);
+ IRExpr** args = mkIRExprVec_0();
+ IRDirty* d = unsafeIRDirty_1_N(
+ val,
+ 0/*regparms*/,
+ "ppc32g_dirtyhelper_MFSPR_287",
+ fnptr_to_fnentry
+ (vbi, &ppc32g_dirtyhelper_MFSPR_287),
+ args
+ );
+ /* execute the dirty call, dumping the result in val. */
+ stmt( IRStmt_Dirty(d) );
+ putIReg( rD_addr,
+ mkWidenFrom32(ty, mkexpr(val), False/*unsigned*/) );
+ DIP("mfspr r%u,%u", rD_addr, (UInt)SPR);
+ break;
+ }
+
+ default:
+ vex_printf("dis_proc_ctl(ppc)(mfspr,SPR)(0x%x)\n", SPR);
+ return False;
+ }
+ break;
+
+ case 0x173: { // mftb (Move from Time Base, PPC32 p475)
+ IRTemp val = newTemp(Ity_I64);
+ IRExpr** args = mkIRExprVec_0();
+ IRDirty* d = unsafeIRDirty_1_N(
+ val,
+ 0/*regparms*/,
+ "ppcg_dirtyhelper_MFTB",
+ fnptr_to_fnentry(vbi, &ppcg_dirtyhelper_MFTB),
+ args );
+ /* execute the dirty call, dumping the result in val. */
+ stmt( IRStmt_Dirty(d) );
+
+ switch (TBR) {
+ case 269:
+ DIP("mftbu r%u", rD_addr);
+ putIReg( rD_addr,
+ mkWidenFrom32(ty, unop(Iop_64HIto32, mkexpr(val)),
+ /* Signed */False) );
+ break;
+ case 268:
+ DIP("mftb r%u", rD_addr);
+ putIReg( rD_addr, (mode64) ? mkexpr(val) :
+ unop(Iop_64to32, mkexpr(val)) );
+ break;
+ default:
+ return False; /* illegal instruction */
+ }
+ break;
+ }
+
+ case 0x090: {
+ // b20==0: mtcrf (Move to Cond Register Fields, PPC32 p477)
+ // b20==1: mtocrf (Move to One Cond Reg Field)
+ Int cr;
+ UChar shft;
+ if (b11 != 0)
+ return False;
+ if (b20 == 1) {
+ /* ppc64 v2.02 spec says mtocrf gives undefined outcome if >
+ 1 field is written. It seems more robust to decline to
+ decode the insn if so. */
+ switch (CRM) {
+ case 0x01: case 0x02: case 0x04: case 0x08:
+ case 0x10: case 0x20: case 0x40: case 0x80:
+ break;
+ default:
+ return False;
+ }
+ }
+ DIP("%s 0x%x,r%u\n", b20==1 ? "mtocrf" : "mtcrf",
+ CRM, rS_addr);
+ /* Write to each field specified by CRM */
+ for (cr = 0; cr < 8; cr++) {
+ if ((CRM & (1 << (7-cr))) == 0)
+ continue;
+ shft = 4*(7-cr);
+ putGST_field( PPC_GST_CR,
+ binop(Iop_Shr32,
+ mkNarrowTo32(ty, mkexpr(rS)),
+ mkU8(shft)), cr );
+ }
+ break;
+ }
+
+ case 0x1D3: // mtspr (Move to Special-Purpose Register, PPC32 p483)
+
+ switch (SPR) { // Choose a register...
+ case 0x1:
+ DIP("mtxer r%u\n", rS_addr);
+ putGST( PPC_GST_XER, mkNarrowTo32(ty, mkexpr(rS)) );
+ break;
+ case 0x8:
+ DIP("mtlr r%u\n", rS_addr);
+ putGST( PPC_GST_LR, mkexpr(rS) );
+ break;
+ case 0x9:
+ DIP("mtctr r%u\n", rS_addr);
+ putGST( PPC_GST_CTR, mkexpr(rS) );
+ break;
+ case 0x100:
+ DIP("mtvrsave r%u\n", rS_addr);
+ putGST( PPC_GST_VRSAVE, mkNarrowTo32(ty, mkexpr(rS)) );
+ break;
+
+ default:
+ vex_printf("dis_proc_ctl(ppc)(mtspr,SPR)(%u)\n", SPR);
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_proc_ctl(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+
+/*
+ Cache Management Instructions
+*/
+static Bool dis_cache_manage ( UInt theInstr,
+ DisResult* dres,
+ VexArchInfo* guest_archinfo )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar b21to25 = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+ UInt lineszB = guest_archinfo->ppc_cache_line_szB;
+ Bool is_dcbzl = False;
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+
+ /* For dcbt, the lowest two bits of b21to25 encode an
+ access-direction hint (TH field) which we ignore. Well, that's
+ what the PowerPC documentation says. In fact xlc -O4 on POWER5
+ seems to generate values of 8 and 10 for b21to25. */
+ if (opc1 == 0x1F && opc2 == 0x116) {
+ /* b21to25 &= ~3; */ /* if the docs were true */
+ b21to25 = 0; /* blunt instrument */
+ }
+ if (opc1 == 0x1F && opc2 == 0x3F6) { // dcbz
+ if (b21to25 == 1) {
+ is_dcbzl = True;
+ b21to25 = 0;
+ if (!(guest_archinfo->ppc_dcbzl_szB)) {
+ vex_printf("dis_cache_manage(ppc)(dcbzl not supported by host)\n");
+ return False;
+ }
+ }
+ }
+
+ if (opc1 != 0x1F || b21to25 != 0 || b0 != 0) {
+ if (0) vex_printf("dis_cache_manage %d %d %d\n",
+ (Int)opc1, (Int)b21to25, (Int)b0);
+ vex_printf("dis_cache_manage(ppc)(opc1|b21to25|b0)\n");
+ return False;
+ }
+
+ /* stay sane .. */
+ vassert(lineszB == 32 || lineszB == 64 || lineszB == 128);
+
+ switch (opc2) {
+//zz case 0x2F6: // dcba (Data Cache Block Allocate, PPC32 p380)
+//zz vassert(0); /* AWAITING TEST CASE */
+//zz DIP("dcba r%u,r%u\n", rA_addr, rB_addr);
+//zz if (0) vex_printf("vex ppc->IR: kludged dcba\n");
+//zz break;
+
+ case 0x056: // dcbf (Data Cache Block Flush, PPC32 p382)
+ DIP("dcbf r%u,r%u\n", rA_addr, rB_addr);
+ /* nop as far as vex is concerned */
+ break;
+
+ case 0x036: // dcbst (Data Cache Block Store, PPC32 p384)
+ DIP("dcbst r%u,r%u\n", rA_addr, rB_addr);
+ /* nop as far as vex is concerned */
+ break;
+
+ case 0x116: // dcbt (Data Cache Block Touch, PPC32 p385)
+ DIP("dcbt r%u,r%u\n", rA_addr, rB_addr);
+ /* nop as far as vex is concerned */
+ break;
+
+ case 0x0F6: // dcbtst (Data Cache Block Touch for Store, PPC32 p386)
+ DIP("dcbtst r%u,r%u\n", rA_addr, rB_addr);
+ /* nop as far as vex is concerned */
+ break;
+
+ case 0x3F6: { // dcbz (Data Cache Block Clear to Zero, PPC32 p387)
+ // dcbzl (Data Cache Block Clear to Zero Long, bug#135264)
+ /* Clear all bytes in cache block at (rA|0) + rB. */
+ IRTemp EA = newTemp(ty);
+ IRTemp addr = newTemp(ty);
+ IRExpr* irx_addr;
+ UInt i;
+ UInt clearszB;
+ if (is_dcbzl) {
+ clearszB = guest_archinfo->ppc_dcbzl_szB;
+ DIP("dcbzl r%u,r%u\n", rA_addr, rB_addr);
+ }
+ else {
+ clearszB = guest_archinfo->ppc_dcbz_szB;
+ DIP("dcbz r%u,r%u\n", rA_addr, rB_addr);
+ }
+
+ assign( EA, ea_rAor0_idxd(rA_addr, rB_addr) );
+
+ if (mode64) {
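+         /* e.g. with a 128-byte block, the mask below is ~127ULL and
+            the loop issues 16 eight-byte zero stores. */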
+ /* Round EA down to the start of the containing block. */
+ assign( addr, binop( Iop_And64,
+ mkexpr(EA),
+ mkU64( ~((ULong)clearszB-1) )) );
+
+ for (i = 0; i < clearszB / 8; i++) {
+ irx_addr = binop( Iop_Add64, mkexpr(addr), mkU64(i*8) );
+ storeBE( irx_addr, mkU64(0) );
+ }
+ } else {
+ /* Round EA down to the start of the containing block. */
+ assign( addr, binop( Iop_And32,
+ mkexpr(EA),
+ mkU32( ~(clearszB-1) )) );
+
+ for (i = 0; i < clearszB / 4; i++) {
+ irx_addr = binop( Iop_Add32, mkexpr(addr), mkU32(i*4) );
+ storeBE( irx_addr, mkU32(0) );
+ }
+ }
+ break;
+ }
+
+ case 0x3D6: {
+ // icbi (Instruction Cache Block Invalidate, PPC32 p431)
+ /* Invalidate all translations containing code from the cache
+ block at (rA|0) + rB. */
+ IRTemp EA = newTemp(ty);
+ IRTemp addr = newTemp(ty);
+ DIP("icbi r%u,r%u\n", rA_addr, rB_addr);
+ assign( EA, ea_rAor0_idxd(rA_addr, rB_addr) );
+
+ /* Round EA down to the start of the containing block. */
+ assign( addr, binop( mkSzOp(ty, Iop_And8),
+ mkexpr(EA),
+ mkSzImm(ty, ~(((ULong)lineszB)-1) )) );
+ putGST( PPC_GST_TISTART, mkexpr(addr) );
+ putGST( PPC_GST_TILEN, mkSzImm(ty, lineszB) );
+
+ /* be paranoid ... */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+
+ irsb->jumpkind = Ijk_TInval;
+ irsb->next = mkSzImm(ty, nextInsnAddr());
+ dres->whatNext = Dis_StopHere;
+ break;
+ }
+
+ default:
+ vex_printf("dis_cache_manage(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Floating Point Helpers ---*/
+/*------------------------------------------------------------*/
+
+/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
+/* Produces a value in 0 .. 3, which is encoded as per the type
+   IRRoundingMode. The PPC rounding-mode encoding differs from
+   IRRoundingMode, so we need to map it.
+*/
+static IRExpr* /* :: Ity_I32 */ get_IR_roundingmode ( void )
+{
+/*
+ rounding mode | PPC | IR
+ ------------------------
+ to nearest | 00 | 00
+ to zero | 01 | 11
+ to +infinity | 10 | 10
+ to -infinity | 11 | 01
+*/
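+/* Checking the formula below against the table: for PPC 01 (to zero),
+   (01 << 1) & 2 = 10, and 01 ^ 10 = 11, the IR to-zero encoding; for
+   PPC 11 (to -infinity), 11 ^ 10 = 01, the IR to-minus-infinity
+   encoding. 00 and 10 map to themselves. */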
+ IRTemp rm_PPC32 = newTemp(Ity_I32);
+ assign( rm_PPC32, getGST_masked( PPC_GST_FPSCR, MASK_FPSCR_RN ) );
+
+ // rm_IR = XOR( rm_PPC32, (rm_PPC32 << 1) & 2)
+ return binop( Iop_Xor32,
+ mkexpr(rm_PPC32),
+ binop( Iop_And32,
+ binop(Iop_Shl32, mkexpr(rm_PPC32), mkU8(1)),
+ mkU32(2) ));
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Floating Point Instruction Translation ---*/
+/*------------------------------------------------------------*/
+
+/*
+ Floating Point Load Instructions
+*/
+static Bool dis_fp_load ( UInt theInstr )
+{
+ /* X-Form, D-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar frD_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+ UInt uimm16 = ifieldUIMM16(theInstr);
+
+ Int simm16 = extend_s_16to32(uimm16);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp EA = newTemp(ty);
+ IRTemp rA = newTemp(ty);
+ IRTemp rB = newTemp(ty);
+ IRTemp iHi = newTemp(Ity_I32);
+ IRTemp iLo = newTemp(Ity_I32);
+
+ assign( rA, getIReg(rA_addr) );
+ assign( rB, getIReg(rB_addr) );
+
+ /* These are completely straightforward from a rounding and status
+ bits perspective: no rounding involved and no funny status or CR
+ bits affected. */
+
+ switch (opc1) {
+ case 0x30: // lfs (Load Float Single, PPC32 p441)
+ DIP("lfs fr%u,%d(r%u)\n", frD_addr, simm16, rA_addr);
+ assign( EA, ea_rAor0_simm(rA_addr, simm16) );
+ putFReg( frD_addr,
+ unop(Iop_F32toF64, loadBE(Ity_F32, mkexpr(EA))) );
+ break;
+
+ case 0x31: // lfsu (Load Float Single, Update, PPC32 p442)
+ if (rA_addr == 0)
+ return False;
+ DIP("lfsu fr%u,%d(r%u)\n", frD_addr, simm16, rA_addr);
+ assign( EA, ea_rA_simm(rA_addr, simm16) );
+ putFReg( frD_addr,
+ unop(Iop_F32toF64, loadBE(Ity_F32, mkexpr(EA))) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x32: // lfd (Load Float Double, PPC32 p437)
+ DIP("lfd fr%u,%d(r%u)\n", frD_addr, simm16, rA_addr);
+ assign( EA, ea_rAor0_simm(rA_addr, simm16) );
+ putFReg( frD_addr, loadBE(Ity_F64, mkexpr(EA)) );
+ break;
+
+ case 0x33: // lfdu (Load Float Double, Update, PPC32 p438)
+ if (rA_addr == 0)
+ return False;
+ DIP("lfdu fr%u,%d(r%u)\n", frD_addr, simm16, rA_addr);
+ assign( EA, ea_rA_simm(rA_addr, simm16) );
+ putFReg( frD_addr, loadBE(Ity_F64, mkexpr(EA)) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x1F:
+ if (b0 != 0) {
+ vex_printf("dis_fp_load(ppc)(instr,b0)\n");
+ return False;
+ }
+
+ switch(opc2) {
+ case 0x217: // lfsx (Load Float Single Indexed, PPC32 p444)
+ DIP("lfsx fr%u,r%u,r%u\n", frD_addr, rA_addr, rB_addr);
+ assign( EA, ea_rAor0_idxd(rA_addr, rB_addr) );
+ putFReg( frD_addr, unop( Iop_F32toF64,
+ loadBE(Ity_F32, mkexpr(EA))) );
+ break;
+
+ case 0x237: // lfsux (Load Float Single, Update Indxd, PPC32 p443)
+ if (rA_addr == 0)
+ return False;
+ DIP("lfsux fr%u,r%u,r%u\n", frD_addr, rA_addr, rB_addr);
+ assign( EA, ea_rA_idxd(rA_addr, rB_addr) );
+ putFReg( frD_addr,
+ unop(Iop_F32toF64, loadBE(Ity_F32, mkexpr(EA))) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x257: // lfdx (Load Float Double Indexed, PPC32 p440)
+ DIP("lfdx fr%u,r%u,r%u\n", frD_addr, rA_addr, rB_addr);
+ assign( EA, ea_rAor0_idxd(rA_addr, rB_addr) );
+ putFReg( frD_addr, loadBE(Ity_F64, mkexpr(EA)) );
+ break;
+
+ case 0x277: // lfdux (Load Float Double, Update Indxd, PPC32 p439)
+ if (rA_addr == 0)
+ return False;
+ DIP("lfdux fr%u,r%u,r%u\n", frD_addr, rA_addr, rB_addr);
+ assign( EA, ea_rA_idxd(rA_addr, rB_addr) );
+ putFReg( frD_addr, loadBE(Ity_F64, mkexpr(EA)) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x357: // lfiwax (Load Float As Integer, Indxd, ISA 2.05 p120)
+ DIP("lfiwax fr%u,r%u,r%u\n", frD_addr, rA_addr, rB_addr);
+ assign( EA, ea_rAor0_idxd( rA_addr, rB_addr ) );
+ assign( iLo, loadBE(Ity_I32, mkexpr(EA)) );
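+         /* (The next assignment computes iHi = 0 - (iLo >>u 31),
+            i.e. 32 copies of iLo's sign bit, so frD below receives
+            the 64-bit sign-extension of the loaded word.) */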
+ assign( iHi, binop(Iop_Sub32,
+ mkU32(0),
+ binop(Iop_Shr32, mkexpr(iLo), mkU8(31))) );
+ putFReg( frD_addr, unop(Iop_ReinterpI64asF64,
+ binop(Iop_32HLto64, mkexpr(iHi), mkexpr(iLo))) );
+ break;
+
+ default:
+ vex_printf("dis_fp_load(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_fp_load(ppc)(opc1)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+/*
+ Floating Point Store Instructions
+*/
+static Bool dis_fp_store ( UInt theInstr )
+{
+ /* X-Form, D-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar frS_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+ Int uimm16 = ifieldUIMM16(theInstr);
+
+ Int simm16 = extend_s_16to32(uimm16);
+ IRTemp frS = newTemp(Ity_F64);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp EA = newTemp(ty);
+ IRTemp rA = newTemp(ty);
+ IRTemp rB = newTemp(ty);
+
+ assign( frS, getFReg(frS_addr) );
+ assign( rA, getIReg(rA_addr) );
+ assign( rB, getIReg(rB_addr) );
+
+ /* These are straightforward from a status bits perspective: no
+ funny status or CR bits affected. For single precision stores,
+ the values are truncated and denormalised (not rounded) to turn
+ them into single precision values. */
+
+ switch (opc1) {
+
+ case 0x34: // stfs (Store Float Single, PPC32 p518)
+ DIP("stfs fr%u,%d(r%u)\n", frS_addr, simm16, rA_addr);
+ assign( EA, ea_rAor0_simm(rA_addr, simm16) );
+      /* Use Iop_TruncF64asF32 to truncate and possibly denormalise
+ the value to be stored in the correct way, without any
+ rounding. */
+ storeBE( mkexpr(EA),
+ unop(Iop_TruncF64asF32, mkexpr(frS)) );
+ break;
+
+ case 0x35: // stfsu (Store Float Single, Update, PPC32 p519)
+ if (rA_addr == 0)
+ return False;
+ DIP("stfsu fr%u,%d(r%u)\n", frS_addr, simm16, rA_addr);
+ assign( EA, ea_rA_simm(rA_addr, simm16) );
+ /* See comment for stfs */
+ storeBE( mkexpr(EA),
+ unop(Iop_TruncF64asF32, mkexpr(frS)) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x36: // stfd (Store Float Double, PPC32 p513)
+ DIP("stfd fr%u,%d(r%u)\n", frS_addr, simm16, rA_addr);
+ assign( EA, ea_rAor0_simm(rA_addr, simm16) );
+ storeBE( mkexpr(EA), mkexpr(frS) );
+ break;
+
+ case 0x37: // stfdu (Store Float Double, Update, PPC32 p514)
+ if (rA_addr == 0)
+ return False;
+ DIP("stfdu fr%u,%d(r%u)\n", frS_addr, simm16, rA_addr);
+ assign( EA, ea_rA_simm(rA_addr, simm16) );
+ storeBE( mkexpr(EA), mkexpr(frS) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x1F:
+ if (b0 != 0) {
+ vex_printf("dis_fp_store(ppc)(instr,b0)\n");
+ return False;
+ }
+ switch(opc2) {
+ case 0x297: // stfsx (Store Float Single Indexed, PPC32 p521)
+ DIP("stfsx fr%u,r%u,r%u\n", frS_addr, rA_addr, rB_addr);
+ assign( EA, ea_rAor0_idxd(rA_addr, rB_addr) );
+ /* See note for stfs */
+ storeBE( mkexpr(EA),
+ unop(Iop_TruncF64asF32, mkexpr(frS)) );
+ break;
+
+ case 0x2B7: // stfsux (Store Float Sgl, Update Indxd, PPC32 p520)
+ if (rA_addr == 0)
+ return False;
+ DIP("stfsux fr%u,r%u,r%u\n", frS_addr, rA_addr, rB_addr);
+ assign( EA, ea_rA_idxd(rA_addr, rB_addr) );
+ /* See note for stfs */
+ storeBE( mkexpr(EA),
+ unop(Iop_TruncF64asF32, mkexpr(frS)) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x2D7: // stfdx (Store Float Double Indexed, PPC32 p516)
+ DIP("stfdx fr%u,r%u,r%u\n", frS_addr, rA_addr, rB_addr);
+ assign( EA, ea_rAor0_idxd(rA_addr, rB_addr) );
+ storeBE( mkexpr(EA), mkexpr(frS) );
+ break;
+
+ case 0x2F7: // stfdux (Store Float Dbl, Update Indxd, PPC32 p515)
+ if (rA_addr == 0)
+ return False;
+ DIP("stfdux fr%u,r%u,r%u\n", frS_addr, rA_addr, rB_addr);
+ assign( EA, ea_rA_idxd(rA_addr, rB_addr) );
+ storeBE( mkexpr(EA), mkexpr(frS) );
+ putIReg( rA_addr, mkexpr(EA) );
+ break;
+
+ case 0x3D7: // stfiwx (Store Float as Int, Indexed, PPC32 p517)
+ // NOTE: POWERPC OPTIONAL, "Graphics Group" (PPC32_GX)
+ DIP("stfiwx fr%u,r%u,r%u\n", frS_addr, rA_addr, rB_addr);
+ assign( EA, ea_rAor0_idxd(rA_addr, rB_addr) );
+ storeBE( mkexpr(EA),
+ unop(Iop_64to32, unop(Iop_ReinterpF64asI64, mkexpr(frS))) );
+ break;
+
+ default:
+ vex_printf("dis_fp_store(ppc)(opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_fp_store(ppc)(opc1)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+/*
+ Floating Point Arith Instructions
+*/
+static Bool dis_fp_arith ( UInt theInstr )
+{
+ /* A-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar frD_addr = ifieldRegDS(theInstr);
+ UChar frA_addr = ifieldRegA(theInstr);
+ UChar frB_addr = ifieldRegB(theInstr);
+ UChar frC_addr = ifieldRegC(theInstr);
+ UChar opc2 = ifieldOPClo5(theInstr);
+ UChar flag_rC = ifieldBIT0(theInstr);
+
+ IRTemp frD = newTemp(Ity_F64);
+ IRTemp frA = newTemp(Ity_F64);
+ IRTemp frB = newTemp(Ity_F64);
+ IRTemp frC = newTemp(Ity_F64);
+ IRExpr* rm = get_IR_roundingmode();
+
+ /* By default, we will examine the results of the operation and set
+ fpscr[FPRF] accordingly. */
+ Bool set_FPRF = True;
+
+ /* By default, if flag_RC is set, we will clear cr1 after the
+ operation. In reality we should set cr1 to indicate the
+ exception status of the operation, but since we're not
+ simulating exceptions, the exception status will appear to be
+ zero. Hence cr1 should be cleared if this is a . form insn. */
+ Bool clear_CR1 = True;
+
+ assign( frA, getFReg(frA_addr));
+ assign( frB, getFReg(frB_addr));
+ assign( frC, getFReg(frC_addr));
+
+ switch (opc1) {
+ case 0x3B:
+ switch (opc2) {
+ case 0x12: // fdivs (Floating Divide Single, PPC32 p407)
+ if (frC_addr != 0)
+ return False;
+ DIP("fdivs%s fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frB_addr);
+ assign( frD, triop( Iop_DivF64r32,
+ rm, mkexpr(frA), mkexpr(frB) ));
+ break;
+
+ case 0x14: // fsubs (Floating Subtract Single, PPC32 p430)
+ if (frC_addr != 0)
+ return False;
+ DIP("fsubs%s fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frB_addr);
+ assign( frD, triop( Iop_SubF64r32,
+ rm, mkexpr(frA), mkexpr(frB) ));
+ break;
+
+ case 0x15: // fadds (Floating Add Single, PPC32 p401)
+ if (frC_addr != 0)
+ return False;
+ DIP("fadds%s fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frB_addr);
+ assign( frD, triop( Iop_AddF64r32,
+ rm, mkexpr(frA), mkexpr(frB) ));
+ break;
+
+ case 0x16: // fsqrts (Floating SqRt (Single-Precision), PPC32 p428)
+ // NOTE: POWERPC OPTIONAL, "General-Purpose Group" (PPC32_FX)
+ if (frA_addr != 0 || frC_addr != 0)
+ return False;
+ DIP("fsqrts%s fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frB_addr);
+ // however illogically, on ppc970 this insn behaves identically
+ // to fsqrt (double-precision). So use SqrtF64, not SqrtF64r32.
+ assign( frD, binop( Iop_SqrtF64, rm, mkexpr(frB) ));
+ break;
+
+ case 0x18: // fres (Floating Reciprocal Estimate Single, PPC32 p421)
+ // NOTE: POWERPC OPTIONAL, "Graphics Group" (PPC32_GX)
+ if (frA_addr != 0 || frC_addr != 0)
+ return False;
+ DIP("fres%s fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frB_addr);
+ { IRExpr* ieee_one
+ = IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL));
+ assign( frD, triop( Iop_DivF64r32,
+ rm,
+ ieee_one, mkexpr(frB) ));
+ }
+ break;
+
+ case 0x19: // fmuls (Floating Multiply Single, PPC32 p414)
+ if (frB_addr != 0)
+ return False;
+ DIP("fmuls%s fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr);
+ assign( frD, triop( Iop_MulF64r32,
+ rm, mkexpr(frA), mkexpr(frC) ));
+ break;
+
+ case 0x1A: // frsqrtes (Floating Recip SqRt Est Single)
+ // NOTE: POWERPC OPTIONAL, "Graphics Group" (PPC32_GX)
+ // Undocumented instruction?
+ if (frA_addr != 0 || frC_addr != 0)
+ return False;
+ DIP("frsqrtes%s fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frB_addr);
+ assign( frD, unop(Iop_Est5FRSqrt, mkexpr(frB)) );
+ break;
+
+ default:
+ vex_printf("dis_fp_arith(ppc)(3B: opc2)\n");
+ return False;
+ }
+ break;
+
+ case 0x3F:
+ switch (opc2) {
+ case 0x12: // fdiv (Floating Div (Double-Precision), PPC32 p406)
+ if (frC_addr != 0)
+ return False;
+ DIP("fdiv%s fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frB_addr);
+ assign( frD, triop(Iop_DivF64, rm, mkexpr(frA), mkexpr(frB)) );
+ break;
+
+ case 0x14: // fsub (Floating Sub (Double-Precision), PPC32 p429)
+ if (frC_addr != 0)
+ return False;
+ DIP("fsub%s fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frB_addr);
+ assign( frD, triop(Iop_SubF64, rm, mkexpr(frA), mkexpr(frB)) );
+ break;
+
+ case 0x15: // fadd (Floating Add (Double-Precision), PPC32 p400)
+ if (frC_addr != 0)
+ return False;
+ DIP("fadd%s fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frB_addr);
+ assign( frD, triop(Iop_AddF64, rm, mkexpr(frA), mkexpr(frB)) );
+ break;
+
+ case 0x16: // fsqrt (Floating SqRt (Double-Precision), PPC32 p427)
+ // NOTE: POWERPC OPTIONAL, "General-Purpose Group" (PPC32_FX)
+ if (frA_addr != 0 || frC_addr != 0)
+ return False;
+ DIP("fsqrt%s fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frB_addr);
+ assign( frD, binop(Iop_SqrtF64, rm, mkexpr(frB)) );
+ break;
+
+ case 0x17: { // fsel (Floating Select, PPC32 p426)
+ // NOTE: POWERPC OPTIONAL, "Graphics Group" (PPC32_GX)
+ IRTemp cc = newTemp(Ity_I32);
+ IRTemp cc_b0 = newTemp(Ity_I32);
+
+ DIP("fsel%s fr%u,fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr, frB_addr);
+
+         // cc: UN == 0x45, LT == 0x01, GT == 0x00, EQ == 0x40
+         // => GT|EQ == ((cc & 0x1) == 0)
+ assign( cc, binop(Iop_CmpF64, mkexpr(frA),
+ IRExpr_Const(IRConst_F64(0))) );
+ assign( cc_b0, binop(Iop_And32, mkexpr(cc), mkU32(1)) );
+
+ // frD = (frA >= 0.0) ? frC : frB
+ // = (cc_b0 == 0) ? frC : frB
+ assign( frD,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpEQ32, mkexpr(cc_b0), mkU32(0))),
+ mkexpr(frB),
+ mkexpr(frC) ));
+
+ /* One of the rare ones which don't mess with FPRF */
+ set_FPRF = False;
+ break;
+ }
+
+ case 0x18: // fre (Floating Reciprocal Estimate)
+ // NOTE: POWERPC OPTIONAL, "Graphics Group" (PPC32_GX)
+         // Note: it is unclear whether this insn really exists;
+         // the ppc970 doesn't have it, but POWER5 does.
+ if (frA_addr != 0 || frC_addr != 0)
+ return False;
+ DIP("fre%s fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frB_addr);
+ { IRExpr* ieee_one
+ = IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL));
+ assign( frD, triop( Iop_DivF64,
+ rm,
+ ieee_one, mkexpr(frB) ));
+ }
+ break;
+
+ case 0x19: // fmul (Floating Mult (Double Precision), PPC32 p413)
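+         /* frB is expected to be 0 here; warn but accept rather
+            than failing the whole translation (contrast fmuls
+            above, which rejects the encoding). */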
+ if (frB_addr != 0)
+ vex_printf("dis_fp_arith(ppc)(instr,fmul)\n");
+ DIP("fmul%s fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr);
+ assign( frD, triop(Iop_MulF64, rm, mkexpr(frA), mkexpr(frC)) );
+ break;
+
+ case 0x1A: // frsqrte (Floating Recip SqRt Est., PPC32 p424)
+ // NOTE: POWERPC OPTIONAL, "Graphics Group" (PPC32_GX)
+ if (frA_addr != 0 || frC_addr != 0)
+ return False;
+ DIP("frsqrte%s fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frB_addr);
+ assign( frD, unop(Iop_Est5FRSqrt, mkexpr(frB)) );
+ break;
+
+ default:
+ vex_printf("dis_fp_arith(ppc)(3F: opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_fp_arith(ppc)(opc1)\n");
+ return False;
+ }
+
+ putFReg( frD_addr, mkexpr(frD) );
+
+ if (set_FPRF) {
+ // XXX XXX XXX FIXME
+ // set FPRF from frD
+ }
+
+ if (flag_rC && clear_CR1) {
+ putCR321( 1, mkU8(0) );
+ putCR0( 1, mkU8(0) );
+ }
+
+ return True;
+}
+
+
+
+/*
+ Floating Point Mult-Add Instructions
+*/
+static Bool dis_fp_multadd ( UInt theInstr )
+{
+ /* A-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar frD_addr = ifieldRegDS(theInstr);
+ UChar frA_addr = ifieldRegA(theInstr);
+ UChar frB_addr = ifieldRegB(theInstr);
+ UChar frC_addr = ifieldRegC(theInstr);
+ UChar opc2 = ifieldOPClo5(theInstr);
+ UChar flag_rC = ifieldBIT0(theInstr);
+
+ IRTemp frD = newTemp(Ity_F64);
+ IRTemp frA = newTemp(Ity_F64);
+ IRTemp frB = newTemp(Ity_F64);
+ IRTemp frC = newTemp(Ity_F64);
+ IRTemp rmt = newTemp(Ity_I32);
+ IRExpr* rm;
+
+ /* By default, we will examine the results of the operation and set
+ fpscr[FPRF] accordingly. */
+ Bool set_FPRF = True;
+
+ /* By default, if flag_RC is set, we will clear cr1 after the
+ operation. In reality we should set cr1 to indicate the
+ exception status of the operation, but since we're not
+ simulating exceptions, the exception status will appear to be
+ zero. Hence cr1 should be cleared if this is a . form insn. */
+ Bool clear_CR1 = True;
+
+ /* Bind the rounding mode expression to a temp; there's no
+ point in creating gratuitous CSEs, as we know we'll need
+ to use it twice. */
+ assign( rmt, get_IR_roundingmode() );
+ rm = mkexpr(rmt);
+
+ assign( frA, getFReg(frA_addr));
+ assign( frB, getFReg(frB_addr));
+ assign( frC, getFReg(frC_addr));
+
+ /* The rounding in this is all a bit dodgy. The idea is to only do
+      one rounding. That clearly isn't achievable without dedicated
+ four-input IR primops, although in the single precision case we
+ can sort-of simulate it by doing the inner multiply in double
+ precision.
+
+ In the negated cases, the negation happens after rounding. */
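+
+   /* Illustrative sketch of the issue: the ideal fmadds is
+      round32(frA*frC + frB), rounded once, whereas composing
+      single-precision ops gives round32(round32(frA*frC) + frB),
+      which can differ in the last ulp.  Since any product of two F32
+      values is exactly representable in F64 (24+24 <= 53 significand
+      bits), doing the multiply at F64 and rounding only at the end
+      gets close to the single-rounding answer. */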
+
+ switch (opc1) {
+ case 0x3B:
+ switch (opc2) {
+ case 0x1C: // fmsubs (Floating Mult-Subtr Single, PPC32 p412)
+ DIP("fmsubs%s fr%u,fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr, frB_addr);
+ assign( frD, qop( Iop_MSubF64r32, rm,
+ mkexpr(frA), mkexpr(frC), mkexpr(frB) ));
+ break;
+
+ case 0x1D: // fmadds (Floating Mult-Add Single, PPC32 p409)
+ DIP("fmadds%s fr%u,fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr, frB_addr);
+ assign( frD, qop( Iop_MAddF64r32, rm,
+ mkexpr(frA), mkexpr(frC), mkexpr(frB) ));
+ break;
+
+ case 0x1E: // fnmsubs (Float Neg Mult-Subtr Single, PPC32 p420)
+ DIP("fnmsubs%s fr%u,fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr, frB_addr);
+ assign( frD, unop( Iop_NegF64,
+ qop( Iop_MSubF64r32, rm,
+ mkexpr(frA), mkexpr(frC), mkexpr(frB) )));
+ break;
+
+ case 0x1F: // fnmadds (Floating Negative Multiply-Add Single, PPC32 p418)
+ DIP("fnmadds%s fr%u,fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr, frB_addr);
+ assign( frD, unop( Iop_NegF64,
+ qop( Iop_MAddF64r32, rm,
+ mkexpr(frA), mkexpr(frC), mkexpr(frB) )));
+ break;
+
+ default:
+ vex_printf("dis_fp_multadd(ppc)(3B: opc2)\n");
+ return False;
+ }
+ break;
+
+ case 0x3F:
+ switch (opc2) {
+ case 0x1C: // fmsub (Float Mult-Sub (Dbl Precision), PPC32 p411)
+ DIP("fmsub%s fr%u,fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr, frB_addr);
+ assign( frD, qop( Iop_MSubF64, rm,
+ mkexpr(frA), mkexpr(frC), mkexpr(frB) ));
+ break;
+
+ case 0x1D: // fmadd (Float Mult-Add (Dbl Precision), PPC32 p408)
+ DIP("fmadd%s fr%u,fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr, frB_addr);
+ assign( frD, qop( Iop_MAddF64, rm,
+ mkexpr(frA), mkexpr(frC), mkexpr(frB) ));
+ break;
+
+ case 0x1E: // fnmsub (Float Neg Mult-Subtr (Dbl Precision), PPC32 p419)
+ DIP("fnmsub%s fr%u,fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr, frB_addr);
+ assign( frD, unop( Iop_NegF64,
+ qop( Iop_MSubF64, rm,
+ mkexpr(frA), mkexpr(frC), mkexpr(frB) )));
+ break;
+
+ case 0x1F: // fnmadd (Float Neg Mult-Add (Dbl Precision), PPC32 p417)
+ DIP("fnmadd%s fr%u,fr%u,fr%u,fr%u\n", flag_rC ? ".":"",
+ frD_addr, frA_addr, frC_addr, frB_addr);
+ assign( frD, unop( Iop_NegF64,
+ qop( Iop_MAddF64, rm,
+ mkexpr(frA), mkexpr(frC), mkexpr(frB) )));
+ break;
+
+ default:
+ vex_printf("dis_fp_multadd(ppc)(3F: opc2)\n");
+ return False;
+ }
+ break;
+
+ default:
+ vex_printf("dis_fp_multadd(ppc)(opc1)\n");
+ return False;
+ }
+
+ putFReg( frD_addr, mkexpr(frD) );
+
+ if (set_FPRF) {
+ // XXX XXX XXX FIXME
+ // set FPRF from frD
+ }
+
+ if (flag_rC && clear_CR1) {
+ putCR321( 1, mkU8(0) );
+ putCR0( 1, mkU8(0) );
+ }
+
+ return True;
+}
+
+
+
+/*
+ Floating Point Compare Instructions
+*/
+static Bool dis_fp_cmp ( UInt theInstr )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar crfD = toUChar( IFIELD( theInstr, 23, 3 ) );
+ UChar b21to22 = toUChar( IFIELD( theInstr, 21, 2 ) );
+ UChar frA_addr = ifieldRegA(theInstr);
+ UChar frB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ IRTemp ccIR = newTemp(Ity_I32);
+ IRTemp ccPPC32 = newTemp(Ity_I32);
+
+ IRTemp frA = newTemp(Ity_F64);
+ IRTemp frB = newTemp(Ity_F64);
+
+ if (opc1 != 0x3F || b21to22 != 0 || b0 != 0) {
+ vex_printf("dis_fp_cmp(ppc)(instr)\n");
+ return False;
+ }
+
+ assign( frA, getFReg(frA_addr));
+ assign( frB, getFReg(frB_addr));
+
+ assign( ccIR, binop(Iop_CmpF64, mkexpr(frA), mkexpr(frB)) );
+
+ /* Map compare result from IR to PPC32 */
+ /*
+ FP cmp result | PPC | IR
+ --------------------------
+ UN | 0x1 | 0x45
+ EQ | 0x2 | 0x40
+ GT | 0x4 | 0x00
+ LT | 0x8 | 0x01
+ */
+
+   // ccPPC32 = Shl(1, (~(ccIR>>5) & 2)
+   //                  | ((ccIR ^ (ccIR>>6)) & 1))
+ assign(
+ ccPPC32,
+ binop(
+ Iop_Shl32,
+ mkU32(1),
+ unop(
+ Iop_32to8,
+ binop(
+ Iop_Or32,
+ binop(
+ Iop_And32,
+ unop(
+ Iop_Not32,
+ binop(Iop_Shr32, mkexpr(ccIR), mkU8(5))
+ ),
+ mkU32(2)
+ ),
+ binop(
+ Iop_And32,
+ binop(
+ Iop_Xor32,
+ mkexpr(ccIR),
+ binop(Iop_Shr32, mkexpr(ccIR), mkU8(6))
+ ),
+ mkU32(1)
+ )
+ )
+ )
+ )
+ );
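+
+   /* Sanity check of the mapping, for the four possible ccIR values:
+        UN: 0x45 -> shift (0|0) = 0 -> 0x1
+        EQ: 0x40 -> shift (0|1) = 1 -> 0x2
+        GT: 0x00 -> shift (2|0) = 2 -> 0x4
+        LT: 0x01 -> shift (2|1) = 3 -> 0x8
+      matching the PPC encoding in the table above. */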
+
+ putGST_field( PPC_GST_CR, mkexpr(ccPPC32), crfD );
+
+ /* CAB: TODO?: Support writing cc to FPSCR->FPCC ?
+ putGST_field( PPC_GST_FPSCR, mkexpr(ccPPC32), 4 );
+ */
+ // XXX XXX XXX FIXME
+ // Also write the result into FPRF (it's not entirely clear how)
+
+ /* Note: Differences between fcmpu and fcmpo are only in exception
+ flag settings, which aren't supported anyway. */
+ switch (opc2) {
+ case 0x000: // fcmpu (Floating Compare Unordered, PPC32 p403)
+ DIP("fcmpu crf%d,fr%u,fr%u\n", crfD, frA_addr, frB_addr);
+ break;
+ case 0x020: // fcmpo (Floating Compare Ordered, PPC32 p402)
+ DIP("fcmpo crf%d,fr%u,fr%u\n", crfD, frA_addr, frB_addr);
+ break;
+ default:
+ vex_printf("dis_fp_cmp(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+/*
+ Floating Point Rounding/Conversion Instructions
+*/
+static Bool dis_fp_round ( UInt theInstr )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar b16to20 = ifieldRegA(theInstr);
+ UChar frD_addr = ifieldRegDS(theInstr);
+ UChar frB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar flag_rC = ifieldBIT0(theInstr);
+
+ IRTemp frD = newTemp(Ity_F64);
+ IRTemp frB = newTemp(Ity_F64);
+ IRTemp r_tmp32 = newTemp(Ity_I32);
+ IRTemp r_tmp64 = newTemp(Ity_I64);
+ IRExpr* rm = get_IR_roundingmode();
+
+ /* By default, we will examine the results of the operation and set
+ fpscr[FPRF] accordingly. */
+ Bool set_FPRF = True;
+
+ /* By default, if flag_RC is set, we will clear cr1 after the
+ operation. In reality we should set cr1 to indicate the
+ exception status of the operation, but since we're not
+ simulating exceptions, the exception status will appear to be
+ zero. Hence cr1 should be cleared if this is a . form insn. */
+ Bool clear_CR1 = True;
+
+ if (opc1 != 0x3F || b16to20 != 0) {
+ vex_printf("dis_fp_round(ppc)(instr)\n");
+ return False;
+ }
+
+ assign( frB, getFReg(frB_addr));
+
+ switch (opc2) {
+ case 0x00C: // frsp (Float Round to Single, PPC32 p423)
+ DIP("frsp%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( frD, binop( Iop_RoundF64toF32, rm, mkexpr(frB) ));
+ break;
+
+ case 0x00E: // fctiw (Float Conv to Int, PPC32 p404)
+ DIP("fctiw%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( r_tmp32,
+ binop(Iop_F64toI32S, rm, mkexpr(frB)) );
+ assign( frD, unop( Iop_ReinterpI64asF64,
+ unop( Iop_32Uto64, mkexpr(r_tmp32))));
+ /* FPRF is undefined after fctiw. Leave unchanged. */
+ set_FPRF = False;
+ break;
+
+ case 0x00F: // fctiwz (Float Conv to Int, Round to Zero, PPC32 p405)
+ DIP("fctiwz%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( r_tmp32,
+ binop(Iop_F64toI32S, mkU32(Irrm_ZERO), mkexpr(frB) ));
+ assign( frD, unop( Iop_ReinterpI64asF64,
+ unop( Iop_32Uto64, mkexpr(r_tmp32))));
+ /* FPRF is undefined after fctiwz. Leave unchanged. */
+ set_FPRF = False;
+ break;
+
+ case 0x32E: // fctid (Float Conv to Int DWord, PPC64 p437)
+ DIP("fctid%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( r_tmp64,
+ binop(Iop_F64toI64S, rm, mkexpr(frB)) );
+ assign( frD, unop( Iop_ReinterpI64asF64, mkexpr(r_tmp64)) );
+ /* FPRF is undefined after fctid. Leave unchanged. */
+ set_FPRF = False;
+ break;
+
+ case 0x32F: // fctidz (Float Conv to Int DWord, Round to Zero, PPC64 p437)
+ DIP("fctidz%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( r_tmp64,
+ binop(Iop_F64toI64S, mkU32(Irrm_ZERO), mkexpr(frB)) );
+ assign( frD, unop( Iop_ReinterpI64asF64, mkexpr(r_tmp64)) );
+ /* FPRF is undefined after fctidz. Leave unchanged. */
+ set_FPRF = False;
+ break;
+
+ case 0x34E: // fcfid (Float Conv from Int DWord, PPC64 p434)
+ DIP("fcfid%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( r_tmp64, unop( Iop_ReinterpF64asI64, mkexpr(frB)) );
+ assign( frD,
+ binop(Iop_I64StoF64, rm, mkexpr(r_tmp64)) );
+ break;
+
+ case 0x188: case 0x1A8: case 0x1C8: case 0x1E8: // frin, friz, frip, frim
+ switch(opc2) {
+ case 0x188: // frin (Floating Round to Integer Nearest)
+ DIP("frin%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( r_tmp64,
+ binop(Iop_F64toI64S, mkU32(Irrm_NEAREST), mkexpr(frB)) );
+ break;
+ case 0x1A8: // friz (Floating Round to Integer Toward Zero)
+ DIP("friz%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( r_tmp64,
+ binop(Iop_F64toI64S, mkU32(Irrm_ZERO), mkexpr(frB)) );
+ break;
+ case 0x1C8: // frip (Floating Round to Integer Plus)
+ DIP("frip%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( r_tmp64,
+ binop(Iop_F64toI64S, mkU32(Irrm_PosINF), mkexpr(frB)) );
+ break;
+ case 0x1E8: // frim (Floating Round to Integer Minus)
+ DIP("frim%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( r_tmp64,
+ binop(Iop_F64toI64S, mkU32(Irrm_NegINF), mkexpr(frB)) );
+ break;
+ }
+
+ /* don't use the rounded integer if frB is outside -9e18..9e18 */
+ /* F64 has only log10(2**52) significant digits anyway */
+ /* need to preserve sign of zero */
+ /* frD = (fabs(frB) > 9e18) ? frB :
+ (sign(frB)) ? -fabs((double)r_tmp64) : (double)r_tmp64 */
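+      /* Wiring of the guard (a reading aid): CmpF64(9e18, |frB|)
+         yields GT (0x00) iff |frB| < 9e18, and Mux0X takes its first
+         arm when the condition byte is zero; so the in-range case
+         uses the rounded r_tmp64, while |frB| >= 9e18 and NaNs
+         (UN, 0x45) fall through to frB unchanged.  The inner Mux0X
+         tests frB's sign bit so that values rounding to zero keep
+         their sign. */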
+ assign(frD, IRExpr_Mux0X( unop(Iop_32to8,
+ binop(Iop_CmpF64,
+ IRExpr_Const(IRConst_F64(9e18)),
+ unop(Iop_AbsF64, mkexpr(frB)))),
+ IRExpr_Mux0X(unop(Iop_32to8,
+ binop(Iop_Shr32,
+ unop(Iop_64HIto32,
+ unop(Iop_ReinterpF64asI64,
+ mkexpr(frB))), mkU8(31))),
+ binop(Iop_I64StoF64, mkU32(0), mkexpr(r_tmp64) ),
+ unop(Iop_NegF64,
+ unop( Iop_AbsF64,
+ binop(Iop_I64StoF64, mkU32(0),
+ mkexpr(r_tmp64)) )) ),
+ mkexpr(frB)));
+ break;
+
+ default:
+ vex_printf("dis_fp_round(ppc)(opc2)\n");
+ return False;
+ }
+
+ putFReg( frD_addr, mkexpr(frD) );
+
+ if (set_FPRF) {
+ // XXX XXX XXX FIXME
+ // set FPRF from frD
+ }
+
+ if (flag_rC && clear_CR1) {
+ putCR321( 1, mkU8(0) );
+ putCR0( 1, mkU8(0) );
+ }
+
+ return True;
+}
+
+/*
+ Floating Point Pair Instructions
+*/
+static Bool dis_fp_pair ( UInt theInstr )
+{
+ /* X-Form/DS-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar frT_hi_addr = ifieldRegDS(theInstr);
+ UChar frT_lo_addr = frT_hi_addr + 1;
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt uimm16 = ifieldUIMM16(theInstr);
+ Int simm16 = extend_s_16to32(uimm16);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp EA_hi = newTemp(ty);
+ IRTemp EA_lo = newTemp(ty);
+ IRTemp frT_hi = newTemp(Ity_F64);
+ IRTemp frT_lo = newTemp(Ity_F64);
+ UChar b0 = ifieldBIT0(theInstr);
+   Bool is_load = False;
+
+   if ((frT_hi_addr % 2) != 0) {
+ vex_printf("dis_fp_pair(ppc) : odd frT register\n");
+ return False;
+ }
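+   /* ISA 2.05: the pair is FRTp and FRTp+1, and FRTp must be even;
+      odd encodings are invalid forms, hence the bail-out above. */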
+
+ switch (opc1) {
+ case 0x1F: // register offset
+ switch(opc2) {
+ case 0x317: // lfdpx (FP Load Double Pair X-form, ISA 2.05 p125)
+ DIP("ldpx fr%u,r%u,r%u\n", frT_hi_addr, rA_addr, rB_addr);
+ is_load = 1;
+ break;
+      case 0x397: // stfdpx (FP Store Double Pair X-form, ISA 2.05 p125)
+         DIP("stfdpx fr%u,r%u,r%u\n", frT_hi_addr, rA_addr, rB_addr);
+ break;
+ default:
+ vex_printf("dis_fp_pair(ppc) : X-form wrong opc2\n");
+ return False;
+ }
+
+ if (b0 != 0) {
+ vex_printf("dis_fp_pair(ppc)(0x1F,b0)\n");
+ return False;
+ }
+ assign( EA_hi, ea_rAor0_idxd( rA_addr, rB_addr ) );
+ break;
+ case 0x39: // lfdp (FP Load Double Pair DS-form, ISA 2.05 p125)
+ DIP("lfdp fr%u,%d(r%u)\n", frT_hi_addr, simm16, rA_addr);
+ assign( EA_hi, ea_rAor0_simm( rA_addr, simm16 ) );
+      is_load = True;
+ break;
+ case 0x3d: // stfdp (FP Store Double Pair DS-form, ISA 2.05 p125)
+ DIP("stfdp fr%u,%d(r%u)\n", frT_hi_addr, simm16, rA_addr);
+ assign( EA_hi, ea_rAor0_simm( rA_addr, simm16 ) );
+ break;
+   default:
+ vex_printf("dis_fp_pair(ppc)(instr)\n");
+ return False;
+ }
+
+ if (mode64)
+ assign( EA_lo, binop(Iop_Add64, mkexpr(EA_hi), mkU64(8)) );
+ else
+ assign( EA_lo, binop(Iop_Add32, mkexpr(EA_hi), mkU32(8)) );
+
+ assign( frT_hi, getFReg(frT_hi_addr) );
+ assign( frT_lo, getFReg(frT_lo_addr) );
+
+ if (is_load) {
+ putFReg( frT_hi_addr, loadBE(Ity_F64, mkexpr(EA_hi)) );
+ putFReg( frT_lo_addr, loadBE(Ity_F64, mkexpr(EA_lo)) );
+ } else {
+ storeBE( mkexpr(EA_hi), mkexpr(frT_hi) );
+ storeBE( mkexpr(EA_lo), mkexpr(frT_lo) );
+ }
+
+ return True;
+}
+
+
+/*
+ Floating Point Move Instructions
+*/
+static Bool dis_fp_move ( UInt theInstr )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar frD_addr = ifieldRegDS(theInstr);
+ UChar frA_addr = ifieldRegA(theInstr);
+ UChar frB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar flag_rC = ifieldBIT0(theInstr);
+
+ IRTemp frD = newTemp(Ity_F64);
+ IRTemp frB = newTemp(Ity_F64);
+ IRTemp itmpB = newTemp(Ity_F64);
+ IRTemp frA;
+ IRTemp signA;
+ IRTemp hiD;
+
+ if (opc1 != 0x3F || (frA_addr != 0 && opc2 != 0x008)) {
+ vex_printf("dis_fp_move(ppc)(instr)\n");
+ return False;
+ }
+
+ assign( frB, getFReg(frB_addr));
+
+ switch (opc2) {
+ case 0x008: // fcpsgn (Floating Copy Sign, ISA_V2.05 p126)
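+      /* Sign of frA, magnitude of frB: e.g. frA = -3.0, frB = 5.0
+         gives frD = -5.0. */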
+ DIP("fcpsgn%s fr%u,fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frA_addr,
+ frB_addr);
+ signA = newTemp(Ity_I32);
+ hiD = newTemp(Ity_I32);
+ itmpB = newTemp(Ity_I64);
+ frA = newTemp(Ity_F64);
+ assign( frA, getFReg(frA_addr) );
+
+ /* get A's sign bit */
+ assign(signA, binop(Iop_And32,
+ unop(Iop_64HIto32, unop(Iop_ReinterpF64asI64,
+ mkexpr(frA))),
+ mkU32(0x80000000)) );
+
+ assign( itmpB, unop(Iop_ReinterpF64asI64, mkexpr(frB)) );
+
+ /* mask off B's sign bit and or in A's sign bit */
+ assign(hiD, binop(Iop_Or32,
+ binop(Iop_And32,
+ unop(Iop_64HIto32,
+ mkexpr(itmpB)), /* frB's high 32 bits */
+ mkU32(0x7fffffff)),
+ mkexpr(signA)) );
+
+ /* combine hiD/loB into frD */
+ assign( frD, unop(Iop_ReinterpI64asF64,
+ binop(Iop_32HLto64,
+ mkexpr(hiD),
+ unop(Iop_64to32,
+ mkexpr(itmpB)))) ); /* frB's low 32 bits */
+ break;
+
+ case 0x028: // fneg (Floating Negate, PPC32 p416)
+ DIP("fneg%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( frD, unop( Iop_NegF64, mkexpr(frB) ));
+ break;
+
+ case 0x048: // fmr (Floating Move Register, PPC32 p410)
+ DIP("fmr%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( frD, mkexpr(frB) );
+ break;
+
+ case 0x088: // fnabs (Floating Negative Absolute Value, PPC32 p415)
+ DIP("fnabs%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( frD, unop( Iop_NegF64, unop( Iop_AbsF64, mkexpr(frB) )));
+ break;
+
+ case 0x108: // fabs (Floating Absolute Value, PPC32 p399)
+ DIP("fabs%s fr%u,fr%u\n", flag_rC ? ".":"", frD_addr, frB_addr);
+ assign( frD, unop( Iop_AbsF64, mkexpr(frB) ));
+ break;
+
+ default:
+ vex_printf("dis_fp_move(ppc)(opc2)\n");
+ return False;
+ }
+
+ putFReg( frD_addr, mkexpr(frD) );
+
+ /* None of these change FPRF. cr1 is set in the usual way though,
+ if flag_rC is set. */
+
+ if (flag_rC) {
+ putCR321( 1, mkU8(0) );
+ putCR0( 1, mkU8(0) );
+ }
+
+ return True;
+}
+
+
+
+/*
+ Floating Point Status/Control Register Instructions
+*/
+static Bool dis_fp_scr ( UInt theInstr )
+{
+ /* Many forms - see each switch case */
+ UChar opc1 = ifieldOPC(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar flag_rC = ifieldBIT0(theInstr);
+
+ if (opc1 != 0x3F) {
+ vex_printf("dis_fp_scr(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x026: { // mtfsb1 (Move to FPSCR Bit 1, PPC32 p479)
+ // Bit crbD of the FPSCR is set.
+ UChar crbD = ifieldRegDS(theInstr);
+ UInt b11to20 = IFIELD(theInstr, 11, 10);
+
+ if (b11to20 != 0) {
+ vex_printf("dis_fp_scr(ppc)(instr,mtfsb1)\n");
+ return False;
+ }
+ DIP("mtfsb1%s crb%d \n", flag_rC ? ".":"", crbD);
+ putGST_masked( PPC_GST_FPSCR, mkU32(1<<(31-crbD)), 1<<(31-crbD) );
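+      /* FPSCR bits are numbered big-endian (bit 0 is the MSB of the
+         32-bit word), hence the 1<<(31-crbD) mask; e.g. crbD == 0
+         (FX) selects mask 0x80000000. */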
+ break;
+ }
+
+ case 0x040: { // mcrfs (Move to Condition Register from FPSCR, PPC32 p465)
+ UChar crfD = toUChar( IFIELD( theInstr, 23, 3 ) );
+ UChar b21to22 = toUChar( IFIELD( theInstr, 21, 2 ) );
+ UChar crfS = toUChar( IFIELD( theInstr, 18, 3 ) );
+ UChar b11to17 = toUChar( IFIELD( theInstr, 11, 7 ) );
+ IRTemp tmp = newTemp(Ity_I32);
+ IRExpr* fpscr_all;
+ if (b21to22 != 0 || b11to17 != 0 || flag_rC != 0) {
+ vex_printf("dis_fp_scr(ppc)(instr,mcrfs)\n");
+ return False;
+ }
+ DIP("mcrfs crf%d,crf%d\n", crfD, crfS);
+ vassert(crfD < 8);
+ vassert(crfS < 8);
+ fpscr_all = getGST_masked( PPC_GST_FPSCR, MASK_FPSCR_RN );
+ assign( tmp, binop(Iop_And32,
+ binop(Iop_Shr32,fpscr_all,mkU8(4 * (7-crfS))),
+ mkU32(0xF)) );
+ putGST_field( PPC_GST_CR, mkexpr(tmp), crfD );
+ break;
+ }
+
+ case 0x046: { // mtfsb0 (Move to FPSCR Bit 0, PPC32 p478)
+ // Bit crbD of the FPSCR is cleared.
+ UChar crbD = ifieldRegDS(theInstr);
+ UInt b11to20 = IFIELD(theInstr, 11, 10);
+
+ if (b11to20 != 0) {
+ vex_printf("dis_fp_scr(ppc)(instr,mtfsb0)\n");
+ return False;
+ }
+ DIP("mtfsb0%s crb%d\n", flag_rC ? ".":"", crbD);
+ putGST_masked( PPC_GST_FPSCR, mkU32(0), 1<<(31-crbD) );
+ break;
+ }
+
+ case 0x086: { // mtfsfi (Move to FPSCR Field Immediate, PPC32 p481)
+ UChar crfD = toUChar( IFIELD( theInstr, 23, 3 ) );
+ UChar b16to22 = toUChar( IFIELD( theInstr, 16, 7 ) );
+ UChar IMM = toUChar( IFIELD( theInstr, 12, 4 ) );
+ UChar b11 = toUChar( IFIELD( theInstr, 11, 1 ) );
+
+ if (b16to22 != 0 || b11 != 0) {
+ vex_printf("dis_fp_scr(ppc)(instr,mtfsfi)\n");
+ return False;
+ }
+ DIP("mtfsfi%s crf%d,%d\n", flag_rC ? ".":"", crfD, IMM);
+ putGST_field( PPC_GST_FPSCR, mkU32(IMM), crfD );
+ break;
+ }
+
+ case 0x247: { // mffs (Move from FPSCR, PPC32 p468)
+ UChar frD_addr = ifieldRegDS(theInstr);
+ UInt b11to20 = IFIELD(theInstr, 11, 10);
+ IRExpr* fpscr_all = getGST_masked( PPC_GST_FPSCR, MASK_FPSCR_RN );
+
+ if (b11to20 != 0) {
+ vex_printf("dis_fp_scr(ppc)(instr,mffs)\n");
+ return False;
+ }
+ DIP("mffs%s fr%u\n", flag_rC ? ".":"", frD_addr);
+ putFReg( frD_addr,
+ unop( Iop_ReinterpI64asF64,
+ unop( Iop_32Uto64, fpscr_all )));
+ break;
+ }
+
+ case 0x2C7: { // mtfsf (Move to FPSCR Fields, PPC32 p480)
+ UChar b25 = toUChar( IFIELD(theInstr, 25, 1) );
+ UChar FM = toUChar( IFIELD(theInstr, 17, 8) );
+ UChar frB_addr = ifieldRegB(theInstr);
+ IRTemp frB = newTemp(Ity_F64);
+ IRTemp rB_32 = newTemp(Ity_I32);
+ Int i, mask;
+
+ if (b25 == 1) {
+         /* New 64-bit move variant for POWER6: if the L field (bit 25)
+          * is one, do a full 64-bit move.  Note that the FPSCR is not
+          * really properly modelled; this instruction only changes the
+          * value of the rounding mode.  The HW exception bits do not
+          * get set in the simulator.  1/12/09
+          */
+ DIP("mtfsf%s %d,fr%u (L=1)\n", flag_rC ? ".":"", FM, frB_addr);
+ mask = 0xFF;
+
+ } else {
+ DIP("mtfsf%s %d,fr%u\n", flag_rC ? ".":"", FM, frB_addr);
+         // Build 32-bit mask from FM: FM bit i selects FPSCR field i,
+         // the 4-bit nibble at bits 4*(7-i)..4*(7-i)+3 (cf. the field
+         // extraction in mcrfs above).
+         mask = 0;
+         for (i=0; i<8; i++) {
+            if ((FM & (1<<(7-i))) != 0) {
+               mask |= 0xF << (4*(7-i));
+            }
+         }
+ }
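+      /* Worked example under that layout: FM = 0x90 selects fields
+         0 and 3, giving mask 0xF00F0000. */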
+ assign( frB, getFReg(frB_addr));
+ assign( rB_32, unop( Iop_64to32,
+ unop( Iop_ReinterpF64asI64, mkexpr(frB) )));
+ putGST_masked( PPC_GST_FPSCR, mkexpr(rB_32), mask );
+ break;
+ }
+
+ default:
+ vex_printf("dis_fp_scr(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+/*------------------------------------------------------------*/
+/*--- AltiVec Instruction Translation ---*/
+/*------------------------------------------------------------*/
+
+/*
+ Altivec Cache Control Instructions (Data Streams)
+*/
+static Bool dis_av_datastream ( UInt theInstr )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar flag_T = toUChar( IFIELD( theInstr, 25, 1 ) );
+ UChar flag_A = flag_T;
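+   /* bit 25 is the T hint for dst/dstst but the A (all-streams)
+      flag for dss, hence the aliasing above */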
+ UChar b23to24 = toUChar( IFIELD( theInstr, 23, 2 ) );
+ UChar STRM = toUChar( IFIELD( theInstr, 21, 2 ) );
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ if (opc1 != 0x1F || b23to24 != 0 || b0 != 0) {
+ vex_printf("dis_av_datastream(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x156: // dst (Data Stream Touch, AV p115)
+ DIP("dst%s r%u,r%u,%d\n", flag_T ? "t" : "",
+ rA_addr, rB_addr, STRM);
+ break;
+
+ case 0x176: // dstst (Data Stream Touch for Store, AV p117)
+ DIP("dstst%s r%u,r%u,%d\n", flag_T ? "t" : "",
+ rA_addr, rB_addr, STRM);
+ break;
+
+ case 0x336: // dss (Data Stream Stop, AV p114)
+ if (rA_addr != 0 || rB_addr != 0) {
+ vex_printf("dis_av_datastream(ppc)(opc2,dst)\n");
+ return False;
+ }
+ if (flag_A == 0) {
+ DIP("dss %d\n", STRM);
+ } else {
+ DIP("dssall\n");
+ }
+ break;
+
+ default:
+ vex_printf("dis_av_datastream(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+/*
+ AltiVec Processor Control Instructions
+*/
+static Bool dis_av_procctl ( UInt theInstr )
+{
+ /* VX-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UInt opc2 = IFIELD( theInstr, 0, 11 );
+
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_procctl(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x604: // mfvscr (Move from VSCR, AV p129)
+ if (vA_addr != 0 || vB_addr != 0) {
+ vex_printf("dis_av_procctl(ppc)(opc2,dst)\n");
+ return False;
+ }
+ DIP("mfvscr v%d\n", vD_addr);
+ putVReg( vD_addr, unop(Iop_32UtoV128, getGST( PPC_GST_VSCR )) );
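+      /* mfvscr defines vD as 96 zero bits followed by the 32-bit
+         VSCR; 32UtoV128 produces exactly that layout. */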
+ break;
+
+ case 0x644: { // mtvscr (Move to VSCR, AV p130)
+ IRTemp vB = newTemp(Ity_V128);
+ if (vD_addr != 0 || vA_addr != 0) {
+ vex_printf("dis_av_procctl(ppc)(opc2,dst)\n");
+ return False;
+ }
+ DIP("mtvscr v%d\n", vB_addr);
+ assign( vB, getVReg(vB_addr));
+ putGST( PPC_GST_VSCR, unop(Iop_V128to32, mkexpr(vB)) );
+ break;
+ }
+ default:
+ vex_printf("dis_av_procctl(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+/*
+ AltiVec Load Instructions
+*/
+static Bool dis_av_load ( VexAbiInfo* vbi, UInt theInstr )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp EA = newTemp(ty);
+ IRTemp EA_align16 = newTemp(ty);
+
+ if (opc1 != 0x1F || b0 != 0) {
+ vex_printf("dis_av_load(ppc)(instr)\n");
+ return False;
+ }
+
+ assign( EA, ea_rAor0_idxd(rA_addr, rB_addr) );
+ assign( EA_align16, addr_align( mkexpr(EA), 16 ) );
+
+ switch (opc2) {
+
+ case 0x006: { // lvsl (Load Vector for Shift Left, AV p123)
+ IRDirty* d;
+ UInt vD_off = vectorGuestRegOffset(vD_addr);
+ IRExpr** args = mkIRExprVec_3(
+ mkU32(vD_off),
+ binop(Iop_And32, mkNarrowTo32(ty, mkexpr(EA)),
+ mkU32(0xF)),
+ mkU32(0)/*left*/ );
+ if (!mode64) {
+ d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "ppc32g_dirtyhelper_LVS",
+ fnptr_to_fnentry(vbi, &ppc32g_dirtyhelper_LVS),
+ args );
+ } else {
+ d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "ppc64g_dirtyhelper_LVS",
+ fnptr_to_fnentry(vbi, &ppc64g_dirtyhelper_LVS),
+ args );
+ }
+ DIP("lvsl v%d,r%u,r%u\n", vD_addr, rA_addr, rB_addr);
+ /* declare guest state effects */
+ d->needsBBP = True;
+ d->nFxState = 1;
+ d->fxState[0].fx = Ifx_Write;
+ d->fxState[0].offset = vD_off;
+ d->fxState[0].size = sizeof(U128);
+
+ /* execute the dirty call, side-effecting guest state */
+ stmt( IRStmt_Dirty(d) );
+ break;
+ }
+ case 0x026: { // lvsr (Load Vector for Shift Right, AV p125)
+ IRDirty* d;
+ UInt vD_off = vectorGuestRegOffset(vD_addr);
+ IRExpr** args = mkIRExprVec_3(
+ mkU32(vD_off),
+ binop(Iop_And32, mkNarrowTo32(ty, mkexpr(EA)),
+ mkU32(0xF)),
+ mkU32(1)/*right*/ );
+ if (!mode64) {
+ d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "ppc32g_dirtyhelper_LVS",
+ fnptr_to_fnentry(vbi, &ppc32g_dirtyhelper_LVS),
+ args );
+ } else {
+ d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "ppc64g_dirtyhelper_LVS",
+ fnptr_to_fnentry(vbi, &ppc64g_dirtyhelper_LVS),
+ args );
+ }
+ DIP("lvsr v%d,r%u,r%u\n", vD_addr, rA_addr, rB_addr);
+ /* declare guest state effects */
+ d->needsBBP = True;
+ d->nFxState = 1;
+ d->fxState[0].fx = Ifx_Write;
+ d->fxState[0].offset = vD_off;
+ d->fxState[0].size = sizeof(U128);
+
+ /* execute the dirty call, side-effecting guest state */
+ stmt( IRStmt_Dirty(d) );
+ break;
+ }
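+   /* For reference (AV ISA): with sh = EA & 0xF, lvsl yields the
+      byte sequence (sh, sh+1, .., sh+15) and lvsr yields
+      (16-sh, .., 31-sh); the dirty helpers above write that
+      permute-control vector straight into the guest vD. */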
+ case 0x007: // lvebx (Load Vector Element Byte Indexed, AV p119)
+ DIP("lvebx v%d,r%u,r%u\n", vD_addr, rA_addr, rB_addr);
+      /* loads the addressed byte into the vector element selected by
+         the low 4 bits of EA; since all other destination bytes are
+         undefined, we can simply load the entire vector from the
+         16-aligned EA */
+ putVReg( vD_addr, loadBE(Ity_V128, mkexpr(EA_align16)) );
+ break;
+
+ case 0x027: // lvehx (Load Vector Element Half Word Indexed, AV p121)
+ DIP("lvehx v%d,r%u,r%u\n", vD_addr, rA_addr, rB_addr);
+ /* see note for lvebx */
+ putVReg( vD_addr, loadBE(Ity_V128, mkexpr(EA_align16)) );
+ break;
+
+ case 0x047: // lvewx (Load Vector Element Word Indexed, AV p122)
+ DIP("lvewx v%d,r%u,r%u\n", vD_addr, rA_addr, rB_addr);
+ /* see note for lvebx */
+ putVReg( vD_addr, loadBE(Ity_V128, mkexpr(EA_align16)) );
+ break;
+
+ case 0x067: // lvx (Load Vector Indexed, AV p127)
+ DIP("lvx v%d,r%u,r%u\n", vD_addr, rA_addr, rB_addr);
+ putVReg( vD_addr, loadBE(Ity_V128, mkexpr(EA_align16)) );
+ break;
+
+ case 0x167: // lvxl (Load Vector Indexed LRU, AV p128)
+ DIP("lvxl v%d,r%u,r%u\n", vD_addr, rA_addr, rB_addr);
+ putVReg( vD_addr, loadBE(Ity_V128, mkexpr(EA_align16)) );
+ break;
+
+ default:
+ vex_printf("dis_av_load(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+
+/*
+ AltiVec Store Instructions
+*/
+static Bool dis_av_store ( UInt theInstr )
+{
+ /* X-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vS_addr = ifieldRegDS(theInstr);
+ UChar rA_addr = ifieldRegA(theInstr);
+ UChar rB_addr = ifieldRegB(theInstr);
+ UInt opc2 = ifieldOPClo10(theInstr);
+ UChar b0 = ifieldBIT0(theInstr);
+
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ IRTemp EA = newTemp(ty);
+ IRTemp addr_aligned = newTemp(ty);
+ IRTemp vS = newTemp(Ity_V128);
+ IRTemp eb = newTemp(Ity_I8);
+ IRTemp idx = newTemp(Ity_I8);
+
+ if (opc1 != 0x1F || b0 != 0) {
+ vex_printf("dis_av_store(ppc)(instr)\n");
+ return False;
+ }
+
+ assign( vS, getVReg(vS_addr));
+ assign( EA, ea_rAor0_idxd(rA_addr, rB_addr) );
+
+ switch (opc2) {
+ case 0x087: { // stvebx (Store Vector Byte Indexed, AV p131)
+ DIP("stvebx v%d,r%u,r%u\n", vS_addr, rA_addr, rB_addr);
+ assign( eb, binop(Iop_And8, mkU8(0xF),
+ unop(Iop_32to8,
+ mkNarrowTo32(ty, mkexpr(EA)) )) );
+ assign( idx, binop(Iop_Shl8,
+ binop(Iop_Sub8, mkU8(15), mkexpr(eb)),
+ mkU8(3)) );
+ storeBE( mkexpr(EA),
+ unop(Iop_32to8, unop(Iop_V128to32,
+ binop(Iop_ShrV128, mkexpr(vS), mkexpr(idx)))) );
+ break;
+ }
+ case 0x0A7: { // stvehx (Store Vector Half Word Indexed, AV p132)
+ DIP("stvehx v%d,r%u,r%u\n", vS_addr, rA_addr, rB_addr);
+ assign( addr_aligned, addr_align(mkexpr(EA), 2) );
+ assign( eb, binop(Iop_And8, mkU8(0xF),
+ mkNarrowTo8(ty, mkexpr(addr_aligned) )) );
+ assign( idx, binop(Iop_Shl8,
+ binop(Iop_Sub8, mkU8(14), mkexpr(eb)),
+ mkU8(3)) );
+ storeBE( mkexpr(addr_aligned),
+ unop(Iop_32to16, unop(Iop_V128to32,
+ binop(Iop_ShrV128, mkexpr(vS), mkexpr(idx)))) );
+ break;
+ }
+ case 0x0C7: { // stvewx (Store Vector Word Indexed, AV p133)
+ DIP("stvewx v%d,r%u,r%u\n", vS_addr, rA_addr, rB_addr);
+ assign( addr_aligned, addr_align(mkexpr(EA), 4) );
+ assign( eb, binop(Iop_And8, mkU8(0xF),
+ mkNarrowTo8(ty, mkexpr(addr_aligned) )) );
+ assign( idx, binop(Iop_Shl8,
+ binop(Iop_Sub8, mkU8(12), mkexpr(eb)),
+ mkU8(3)) );
+ storeBE( mkexpr(addr_aligned),
+ unop(Iop_V128to32,
+ binop(Iop_ShrV128, mkexpr(vS), mkexpr(idx))) );
+ break;
+ }
+
+ case 0x0E7: // stvx (Store Vector Indexed, AV p134)
+ DIP("stvx v%d,r%u,r%u\n", vS_addr, rA_addr, rB_addr);
+ storeBE( addr_align( mkexpr(EA), 16 ), mkexpr(vS) );
+ break;
+
+ case 0x1E7: // stvxl (Store Vector Indexed LRU, AV p135)
+ DIP("stvxl v%d,r%u,r%u\n", vS_addr, rA_addr, rB_addr);
+ storeBE( addr_align( mkexpr(EA), 16 ), mkexpr(vS) );
+ break;
+
+ default:
+ vex_printf("dis_av_store(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+/*
+ AltiVec Arithmetic Instructions
+*/
+static Bool dis_av_arith ( UInt theInstr )
+{
+ /* VX-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UInt opc2 = IFIELD( theInstr, 0, 11 );
+
+ IRTemp vA = newTemp(Ity_V128);
+ IRTemp vB = newTemp(Ity_V128);
+ IRTemp z3 = newTemp(Ity_I64);
+ IRTemp z2 = newTemp(Ity_I64);
+ IRTemp z1 = newTemp(Ity_I64);
+ IRTemp z0 = newTemp(Ity_I64);
+ IRTemp aEvn, aOdd;
+ IRTemp a15, a14, a13, a12, a11, a10, a9, a8;
+ IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
+ IRTemp b3, b2, b1, b0;
+
+ aEvn = aOdd = IRTemp_INVALID;
+ a15 = a14 = a13 = a12 = a11 = a10 = a9 = a8 = IRTemp_INVALID;
+ a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
+ b3 = b2 = b1 = b0 = IRTemp_INVALID;
+
+ assign( vA, getVReg(vA_addr));
+ assign( vB, getVReg(vB_addr));
+
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_arith(ppc)(opc1 != 0x4)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ /* Add */
+ case 0x180: { // vaddcuw (Add Carryout Unsigned Word, AV p136)
+ DIP("vaddcuw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ /* unsigned_ov(x+y) = (y >u not(x)) */
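+      /* (x+y carries out of 32 bits iff y > 0xFFFFFFFF - x = ~x;
+         the compare gives all-ones lanes for true, and >>31 reduces
+         that to the architected 0/1 per word.) */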
+ putVReg( vD_addr, binop(Iop_ShrN32x4,
+ binop(Iop_CmpGT32Ux4, mkexpr(vB),
+ unop(Iop_NotV128, mkexpr(vA))),
+ mkU8(31)) );
+ break;
+ }
+ case 0x000: // vaddubm (Add Unsigned Byte Modulo, AV p141)
+ DIP("vaddubm v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Add8x16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x040: // vadduhm (Add Unsigned Half Word Modulo, AV p143)
+ DIP("vadduhm v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Add16x8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x080: // vadduwm (Add Unsigned Word Modulo, AV p145)
+ DIP("vadduwm v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Add32x4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x200: // vaddubs (Add Unsigned Byte Saturate, AV p142)
+ DIP("vaddubs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QAdd8Ux16, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT], perhaps via new primop: Iop_SatOfQAdd8Ux16
+ break;
+
+ case 0x240: // vadduhs (Add Unsigned Half Word Saturate, AV p144)
+ DIP("vadduhs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QAdd16Ux8, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+ case 0x280: // vadduws (Add Unsigned Word Saturate, AV p146)
+ DIP("vadduws v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QAdd32Ux4, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+ case 0x300: // vaddsbs (Add Signed Byte Saturate, AV p138)
+ DIP("vaddsbs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QAdd8Sx16, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+ case 0x340: // vaddshs (Add Signed Half Word Saturate, AV p139)
+ DIP("vaddshs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QAdd16Sx8, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+ case 0x380: // vaddsws (Add Signed Word Saturate, AV p140)
+ DIP("vaddsws v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QAdd32Sx4, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+
+ /* Subtract */
+ case 0x580: { // vsubcuw (Subtract Carryout Unsigned Word, AV p260)
+ DIP("vsubcuw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ /* unsigned_ov(x-y) = (y >u x) */
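+      /* (vD gets 1 exactly when no borrow occurs, i.e. x >=u y,
+         hence the NotV128 around the compare before the >>31
+         reduction.) */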
+ putVReg( vD_addr, binop(Iop_ShrN32x4,
+ unop(Iop_NotV128,
+ binop(Iop_CmpGT32Ux4, mkexpr(vB),
+ mkexpr(vA))),
+ mkU8(31)) );
+ break;
+ }
+ case 0x400: // vsububm (Subtract Unsigned Byte Modulo, AV p265)
+ DIP("vsububm v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Sub8x16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x440: // vsubuhm (Subtract Unsigned Half Word Modulo, AV p267)
+ DIP("vsubuhm v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Sub16x8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x480: // vsubuwm (Subtract Unsigned Word Modulo, AV p269)
+ DIP("vsubuwm v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Sub32x4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x600: // vsububs (Subtract Unsigned Byte Saturate, AV p266)
+ DIP("vsububs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QSub8Ux16, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+ case 0x640: // vsubuhs (Subtract Unsigned HWord Saturate, AV p268)
+ DIP("vsubuhs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QSub16Ux8, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+ case 0x680: // vsubuws (Subtract Unsigned Word Saturate, AV p270)
+ DIP("vsubuws v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QSub32Ux4, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+ case 0x700: // vsubsbs (Subtract Signed Byte Saturate, AV p262)
+ DIP("vsubsbs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QSub8Sx16, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+ case 0x740: // vsubshs (Subtract Signed Half Word Saturate, AV p263)
+ DIP("vsubshs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QSub16Sx8, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+ case 0x780: // vsubsws (Subtract Signed Word Saturate, AV p264)
+ DIP("vsubsws v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_QSub32Sx4, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ break;
+
+
+ /* Maximum */
+ case 0x002: // vmaxub (Maximum Unsigned Byte, AV p182)
+ DIP("vmaxub v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Max8Ux16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x042: // vmaxuh (Maximum Unsigned Half Word, AV p183)
+ DIP("vmaxuh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Max16Ux8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x082: // vmaxuw (Maximum Unsigned Word, AV p184)
+ DIP("vmaxuw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Max32Ux4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x102: // vmaxsb (Maximum Signed Byte, AV p179)
+ DIP("vmaxsb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Max8Sx16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x142: // vmaxsh (Maximum Signed Half Word, AV p180)
+ DIP("vmaxsh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Max16Sx8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x182: // vmaxsw (Maximum Signed Word, AV p181)
+ DIP("vmaxsw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Max32Sx4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+
+ /* Minimum */
+ case 0x202: // vminub (Minimum Unsigned Byte, AV p191)
+ DIP("vminub v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Min8Ux16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x242: // vminuh (Minimum Unsigned Half Word, AV p192)
+ DIP("vminuh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Min16Ux8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x282: // vminuw (Minimum Unsigned Word, AV p193)
+ DIP("vminuw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Min32Ux4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x302: // vminsb (Minimum Signed Byte, AV p188)
+ DIP("vminsb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Min8Sx16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x342: // vminsh (Minimum Signed Half Word, AV p189)
+ DIP("vminsh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Min16Sx8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x382: // vminsw (Minimum Signed Word, AV p190)
+ DIP("vminsw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Min32Sx4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+
+ /* Average */
+ case 0x402: // vavgub (Average Unsigned Byte, AV p152)
+ DIP("vavgub v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Avg8Ux16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x442: // vavguh (Average Unsigned Half Word, AV p153)
+ DIP("vavguh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Avg16Ux8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x482: // vavguw (Average Unsigned Word, AV p154)
+ DIP("vavguw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Avg32Ux4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x502: // vavgsb (Average Signed Byte, AV p149)
+ DIP("vavgsb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Avg8Sx16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x542: // vavgsh (Average Signed Half Word, AV p150)
+ DIP("vavgsh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Avg16Sx8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x582: // vavgsw (Average Signed Word, AV p151)
+ DIP("vavgsw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Avg32Sx4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+
+ /* Multiply */
+ case 0x008: // vmuloub (Multiply Odd Unsigned Byte, AV p213)
+ DIP("vmuloub v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_MullEven8Ux16, mkexpr(vA), mkexpr(vB)));
+ break;
+
+ case 0x048: // vmulouh (Multiply Odd Unsigned Half Word, AV p214)
+ DIP("vmulouh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_MullEven16Ux8, mkexpr(vA), mkexpr(vB)));
+ break;
+
+ case 0x108: // vmulosb (Multiply Odd Signed Byte, AV p211)
+ DIP("vmulosb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_MullEven8Sx16, mkexpr(vA), mkexpr(vB)));
+ break;
+
+ case 0x148: // vmulosh (Multiply Odd Signed Half Word, AV p212)
+ DIP("vmulosh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_MullEven16Sx8, mkexpr(vA), mkexpr(vB)));
+ break;
+
+ case 0x208: // vmuleub (Multiply Even Unsigned Byte, AV p209)
+ DIP("vmuleub v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, MK_Iop_MullOdd8Ux16( mkexpr(vA), mkexpr(vB) ));
+ break;
+
+ case 0x248: // vmuleuh (Multiply Even Unsigned Half Word, AV p210)
+ DIP("vmuleuh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, MK_Iop_MullOdd16Ux8( mkexpr(vA), mkexpr(vB) ));
+ break;
+
+ case 0x308: // vmulesb (Multiply Even Signed Byte, AV p207)
+ DIP("vmulesb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, MK_Iop_MullOdd8Sx16( mkexpr(vA), mkexpr(vB) ));
+ break;
+
+ case 0x348: // vmulesh (Multiply Even Signed Half Word, AV p208)
+ DIP("vmulesh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, MK_Iop_MullOdd16Sx8( mkexpr(vA), mkexpr(vB) ));
+ break;
+
+
+ /* Sum Across Partial */
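+   /* Strategy for the vsum* cases below: widen the lanes out to
+      I64s so intermediate sums cannot overflow, add with plain
+      Iop_Add64, then saturate exactly once at the end via
+      mkV128from4x64S/U -- mirroring the single final saturation
+      the insns specify. */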
+ case 0x608: { // vsum4ubs (Sum Partial (1/4) UB Saturate, AV p275)
+ IRTemp aEE, aEO, aOE, aOO;
+ aEE = aEO = aOE = aOO = IRTemp_INVALID;
+ DIP("vsum4ubs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+
+      /* vA: V128_8Ux16 -> 4 x V128_32Ux4, zero-extended */
+ expand8Ux16( mkexpr(vA), &aEvn, &aOdd ); // (15,13...),(14,12...)
+ expand16Ux8( mkexpr(aEvn), &aEE, &aEO ); // (15,11...),(13, 9...)
+ expand16Ux8( mkexpr(aOdd), &aOE, &aOO ); // (14,10...),(12, 8...)
+
+ /* break V128 to 4xI32's, zero-extending to I64's */
+ breakV128to4x64U( mkexpr(aEE), &a15, &a11, &a7, &a3 );
+ breakV128to4x64U( mkexpr(aOE), &a14, &a10, &a6, &a2 );
+ breakV128to4x64U( mkexpr(aEO), &a13, &a9, &a5, &a1 );
+ breakV128to4x64U( mkexpr(aOO), &a12, &a8, &a4, &a0 );
+ breakV128to4x64U( mkexpr(vB), &b3, &b2, &b1, &b0 );
+
+ /* add lanes */
+ assign( z3, binop(Iop_Add64, mkexpr(b3),
+ binop(Iop_Add64,
+ binop(Iop_Add64, mkexpr(a15), mkexpr(a14)),
+ binop(Iop_Add64, mkexpr(a13), mkexpr(a12)))) );
+ assign( z2, binop(Iop_Add64, mkexpr(b2),
+ binop(Iop_Add64,
+ binop(Iop_Add64, mkexpr(a11), mkexpr(a10)),
+ binop(Iop_Add64, mkexpr(a9), mkexpr(a8)))) );
+ assign( z1, binop(Iop_Add64, mkexpr(b1),
+ binop(Iop_Add64,
+ binop(Iop_Add64, mkexpr(a7), mkexpr(a6)),
+ binop(Iop_Add64, mkexpr(a5), mkexpr(a4)))) );
+ assign( z0, binop(Iop_Add64, mkexpr(b0),
+ binop(Iop_Add64,
+ binop(Iop_Add64, mkexpr(a3), mkexpr(a2)),
+ binop(Iop_Add64, mkexpr(a1), mkexpr(a0)))) );
+
+ /* saturate-narrow to 32bit, and combine to V128 */
+ putVReg( vD_addr, mkV128from4x64U( mkexpr(z3), mkexpr(z2),
+ mkexpr(z1), mkexpr(z0)) );
+ break;
+ }
+ case 0x708: { // vsum4sbs (Sum Partial (1/4) SB Saturate, AV p273)
+ IRTemp aEE, aEO, aOE, aOO;
+ aEE = aEO = aOE = aOO = IRTemp_INVALID;
+ DIP("vsum4sbs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+
+ /* vA: V128_8Sx16 -> 4 x V128_32Sx4, sign-extended */
+ expand8Sx16( mkexpr(vA), &aEvn, &aOdd ); // (15,13...),(14,12...)
+ expand16Sx8( mkexpr(aEvn), &aEE, &aEO ); // (15,11...),(13, 9...)
+ expand16Sx8( mkexpr(aOdd), &aOE, &aOO ); // (14,10...),(12, 8...)
+
+ /* break V128 to 4xI32's, sign-extending to I64's */
+ breakV128to4x64S( mkexpr(aEE), &a15, &a11, &a7, &a3 );
+ breakV128to4x64S( mkexpr(aOE), &a14, &a10, &a6, &a2 );
+ breakV128to4x64S( mkexpr(aEO), &a13, &a9, &a5, &a1 );
+ breakV128to4x64S( mkexpr(aOO), &a12, &a8, &a4, &a0 );
+ breakV128to4x64S( mkexpr(vB), &b3, &b2, &b1, &b0 );
+
+ /* add lanes */
+ assign( z3, binop(Iop_Add64, mkexpr(b3),
+ binop(Iop_Add64,
+ binop(Iop_Add64, mkexpr(a15), mkexpr(a14)),
+ binop(Iop_Add64, mkexpr(a13), mkexpr(a12)))) );
+ assign( z2, binop(Iop_Add64, mkexpr(b2),
+ binop(Iop_Add64,
+ binop(Iop_Add64, mkexpr(a11), mkexpr(a10)),
+ binop(Iop_Add64, mkexpr(a9), mkexpr(a8)))) );
+ assign( z1, binop(Iop_Add64, mkexpr(b1),
+ binop(Iop_Add64,
+ binop(Iop_Add64, mkexpr(a7), mkexpr(a6)),
+ binop(Iop_Add64, mkexpr(a5), mkexpr(a4)))) );
+ assign( z0, binop(Iop_Add64, mkexpr(b0),
+ binop(Iop_Add64,
+ binop(Iop_Add64, mkexpr(a3), mkexpr(a2)),
+ binop(Iop_Add64, mkexpr(a1), mkexpr(a0)))) );
+
+ /* saturate-narrow to 32bit, and combine to V128 */
+ putVReg( vD_addr, mkV128from4x64S( mkexpr(z3), mkexpr(z2),
+ mkexpr(z1), mkexpr(z0)) );
+ break;
+ }
+ case 0x648: { // vsum4shs (Sum Partial (1/4) SHW Saturate, AV p274)
+ DIP("vsum4shs v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+
+ /* vA: V128_16Sx8 -> 2 x V128_32Sx4, sign-extended */
+ expand16Sx8( mkexpr(vA), &aEvn, &aOdd ); // (7,5...),(6,4...)
+
+ /* break V128 to 4xI32's, sign-extending to I64's */
+ breakV128to4x64S( mkexpr(aEvn), &a7, &a5, &a3, &a1 );
+ breakV128to4x64S( mkexpr(aOdd), &a6, &a4, &a2, &a0 );
+ breakV128to4x64S( mkexpr(vB), &b3, &b2, &b1, &b0 );
+
+ /* add lanes */
+ assign( z3, binop(Iop_Add64, mkexpr(b3),
+ binop(Iop_Add64, mkexpr(a7), mkexpr(a6))));
+ assign( z2, binop(Iop_Add64, mkexpr(b2),
+ binop(Iop_Add64, mkexpr(a5), mkexpr(a4))));
+ assign( z1, binop(Iop_Add64, mkexpr(b1),
+ binop(Iop_Add64, mkexpr(a3), mkexpr(a2))));
+ assign( z0, binop(Iop_Add64, mkexpr(b0),
+ binop(Iop_Add64, mkexpr(a1), mkexpr(a0))));
+
+ /* saturate-narrow to 32bit, and combine to V128 */
+ putVReg( vD_addr, mkV128from4x64S( mkexpr(z3), mkexpr(z2),
+ mkexpr(z1), mkexpr(z0)) );
+ break;
+ }
+ case 0x688: { // vsum2sws (Sum Partial (1/2) SW Saturate, AV p272)
+ DIP("vsum2sws v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+
+ /* break V128 to 4xI32's, sign-extending to I64's */
+ breakV128to4x64S( mkexpr(vA), &a3, &a2, &a1, &a0 );
+ breakV128to4x64S( mkexpr(vB), &b3, &b2, &b1, &b0 );
+
+ /* add lanes */
+ assign( z2, binop(Iop_Add64, mkexpr(b2),
+ binop(Iop_Add64, mkexpr(a3), mkexpr(a2))) );
+ assign( z0, binop(Iop_Add64, mkexpr(b0),
+ binop(Iop_Add64, mkexpr(a1), mkexpr(a0))) );
+
+ /* saturate-narrow to 32bit, and combine to V128 */
+ putVReg( vD_addr, mkV128from4x64S( mkU64(0), mkexpr(z2),
+ mkU64(0), mkexpr(z0)) );
+ break;
+ }
+ case 0x788: { // vsumsws (Sum SW Saturate, AV p271)
+ DIP("vsumsws v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+
+ /* break V128 to 4xI32's, sign-extending to I64's */
+ breakV128to4x64S( mkexpr(vA), &a3, &a2, &a1, &a0 );
+ breakV128to4x64S( mkexpr(vB), &b3, &b2, &b1, &b0 );
+
+ /* add lanes */
+ assign( z0, binop(Iop_Add64, mkexpr(b0),
+ binop(Iop_Add64,
+ binop(Iop_Add64, mkexpr(a3), mkexpr(a2)),
+ binop(Iop_Add64, mkexpr(a1), mkexpr(a0)))) );
+
+ /* saturate-narrow to 32bit, and combine to V128 */
+ putVReg( vD_addr, mkV128from4x64S( mkU64(0), mkU64(0),
+ mkU64(0), mkexpr(z0)) );
+ break;
+ }
+ default:
+ vex_printf("dis_av_arith(ppc)(opc2=0x%x)\n", opc2);
+ return False;
+ }
+ return True;
+}
+
+/*
+ AltiVec Logic Instructions
+*/
+static Bool dis_av_logic ( UInt theInstr )
+{
+ /* VX-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UInt opc2 = IFIELD( theInstr, 0, 11 );
+
+ IRTemp vA = newTemp(Ity_V128);
+ IRTemp vB = newTemp(Ity_V128);
+ assign( vA, getVReg(vA_addr));
+ assign( vB, getVReg(vB_addr));
+
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_logic(ppc)(opc1 != 0x4)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x404: // vand (And, AV p147)
+ DIP("vand v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_AndV128, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x444: // vandc (And, AV p148)
+ DIP("vandc v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_AndV128, mkexpr(vA),
+ unop(Iop_NotV128, mkexpr(vB))) );
+ break;
+
+ case 0x484: // vor (Or, AV p217)
+ DIP("vor v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_OrV128, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x4C4: // vxor (Xor, AV p282)
+ DIP("vxor v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_XorV128, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x504: // vnor (Nor, AV p216)
+ DIP("vnor v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ unop(Iop_NotV128, binop(Iop_OrV128, mkexpr(vA), mkexpr(vB))) );
+ break;
+
+ default:
+ vex_printf("dis_av_logic(ppc)(opc2=0x%x)\n", opc2);
+ return False;
+ }
+ return True;
+}
+
+/*
+ AltiVec Compare Instructions
+*/
+static Bool dis_av_cmp ( UInt theInstr )
+{
+ /* VXR-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UChar flag_rC = ifieldBIT10(theInstr);
+ UInt opc2 = IFIELD( theInstr, 0, 10 );
+
+ IRTemp vA = newTemp(Ity_V128);
+ IRTemp vB = newTemp(Ity_V128);
+ IRTemp vD = newTemp(Ity_V128);
+ assign( vA, getVReg(vA_addr));
+ assign( vB, getVReg(vB_addr));
+
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_cmp(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x006: // vcmpequb (Compare Equal-to Unsigned B, AV p160)
+ DIP("vcmpequb%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpEQ8x16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x046: // vcmpequh (Compare Equal-to Unsigned HW, AV p161)
+ DIP("vcmpequh%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpEQ16x8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x086: // vcmpequw (Compare Equal-to Unsigned W, AV p162)
+ DIP("vcmpequw%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpEQ32x4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x206: // vcmpgtub (Compare Greater-than Unsigned B, AV p168)
+ DIP("vcmpgtub%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpGT8Ux16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x246: // vcmpgtuh (Compare Greater-than Unsigned HW, AV p169)
+ DIP("vcmpgtuh%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpGT16Ux8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x286: // vcmpgtuw (Compare Greater-than Unsigned W, AV p170)
+ DIP("vcmpgtuw%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpGT32Ux4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x306: // vcmpgtsb (Compare Greater-than Signed B, AV p165)
+ DIP("vcmpgtsb%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpGT8Sx16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x346: // vcmpgtsh (Compare Greater-than Signed HW, AV p166)
+ DIP("vcmpgtsh%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpGT16Sx8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x386: // vcmpgtsw (Compare Greater-than Signed W, AV p167)
+ DIP("vcmpgtsw%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpGT32Sx4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ default:
+ vex_printf("dis_av_cmp(ppc)(opc2)\n");
+ return False;
+ }
+
+ putVReg( vD_addr, mkexpr(vD) );
+
+ if (flag_rC) {
+ set_AV_CR6( mkexpr(vD), True );
+ }
+ return True;
+}
+
+/*
+ AltiVec Multiply-Sum Instructions
+*/
+static Bool dis_av_multarith ( UInt theInstr )
+{
+ /* VA-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UChar vC_addr = ifieldRegC(theInstr);
+ UChar opc2 = toUChar( IFIELD( theInstr, 0, 6 ) );
+
+ IRTemp vA = newTemp(Ity_V128);
+ IRTemp vB = newTemp(Ity_V128);
+ IRTemp vC = newTemp(Ity_V128);
+ IRTemp zeros = newTemp(Ity_V128);
+ IRTemp aLo = newTemp(Ity_V128);
+ IRTemp bLo = newTemp(Ity_V128);
+ IRTemp cLo = newTemp(Ity_V128);
+ IRTemp zLo = newTemp(Ity_V128);
+ IRTemp aHi = newTemp(Ity_V128);
+ IRTemp bHi = newTemp(Ity_V128);
+ IRTemp cHi = newTemp(Ity_V128);
+ IRTemp zHi = newTemp(Ity_V128);
+ IRTemp abEvn = newTemp(Ity_V128);
+ IRTemp abOdd = newTemp(Ity_V128);
+ IRTemp z3 = newTemp(Ity_I64);
+ IRTemp z2 = newTemp(Ity_I64);
+ IRTemp z1 = newTemp(Ity_I64);
+ IRTemp z0 = newTemp(Ity_I64);
+ IRTemp ab7, ab6, ab5, ab4, ab3, ab2, ab1, ab0;
+ IRTemp c3, c2, c1, c0;
+
+ ab7 = ab6 = ab5 = ab4 = ab3 = ab2 = ab1 = ab0 = IRTemp_INVALID;
+ c3 = c2 = c1 = c0 = IRTemp_INVALID;
+
+ assign( vA, getVReg(vA_addr));
+ assign( vB, getVReg(vB_addr));
+ assign( vC, getVReg(vC_addr));
+ assign( zeros, unop(Iop_Dup32x4, mkU32(0)) );
+
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_multarith(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ /* Multiply-Add */
+ case 0x20: { // vmhaddshs (Mult Hi, Add Signed HW Saturate, AV p185)
+ IRTemp cSigns = newTemp(Ity_V128);
+ DIP("vmhaddshs v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+ assign(cSigns, binop(Iop_CmpGT16Sx8, mkexpr(zeros), mkexpr(vC)));
+ assign(aLo, binop(Iop_InterleaveLO16x8, mkexpr(zeros), mkexpr(vA)));
+ assign(bLo, binop(Iop_InterleaveLO16x8, mkexpr(zeros), mkexpr(vB)));
+ assign(cLo, binop(Iop_InterleaveLO16x8, mkexpr(cSigns),mkexpr(vC)));
+ assign(aHi, binop(Iop_InterleaveHI16x8, mkexpr(zeros), mkexpr(vA)));
+ assign(bHi, binop(Iop_InterleaveHI16x8, mkexpr(zeros), mkexpr(vB)));
+ assign(cHi, binop(Iop_InterleaveHI16x8, mkexpr(cSigns),mkexpr(vC)));
+
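+      /* The interleaves give each 16-bit element its own 32-bit lane
+         (vA and vB padded with zeros -- the padding occupies lanes
+         MullEven16Sx8 does not read -- and vC genuinely sign-extended,
+         since it is added as a 32-bit value).  MullEven16Sx8 then
+         yields full 32-bit products; the >>15 below realigns them and
+         QNarrow32Sx4 saturates back to halfwords. */
+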
+ assign( zLo, binop(Iop_Add32x4, mkexpr(cLo),
+ binop(Iop_SarN32x4,
+ binop(Iop_MullEven16Sx8,
+ mkexpr(aLo), mkexpr(bLo)),
+ mkU8(15))) );
+
+ assign( zHi, binop(Iop_Add32x4, mkexpr(cHi),
+ binop(Iop_SarN32x4,
+ binop(Iop_MullEven16Sx8,
+ mkexpr(aHi), mkexpr(bHi)),
+ mkU8(15))) );
+
+ putVReg( vD_addr,
+ binop(Iop_QNarrow32Sx4, mkexpr(zHi), mkexpr(zLo)) );
+ break;
+ }
+ case 0x21: { // vmhraddshs (Mult High Round, Add Signed HW Saturate, AV p186)
+ IRTemp zKonst = newTemp(Ity_V128);
+ IRTemp cSigns = newTemp(Ity_V128);
+ DIP("vmhraddshs v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+ assign(cSigns, binop(Iop_CmpGT16Sx8, mkexpr(zeros), mkexpr(vC)) );
+ assign(aLo, binop(Iop_InterleaveLO16x8, mkexpr(zeros), mkexpr(vA)));
+ assign(bLo, binop(Iop_InterleaveLO16x8, mkexpr(zeros), mkexpr(vB)));
+ assign(cLo, binop(Iop_InterleaveLO16x8, mkexpr(cSigns),mkexpr(vC)));
+ assign(aHi, binop(Iop_InterleaveHI16x8, mkexpr(zeros), mkexpr(vA)));
+ assign(bHi, binop(Iop_InterleaveHI16x8, mkexpr(zeros), mkexpr(vB)));
+ assign(cHi, binop(Iop_InterleaveHI16x8, mkexpr(cSigns),mkexpr(vC)));
+
+ /* shifting our const avoids store/load version of Dup */
+ assign( zKonst, binop(Iop_ShlN32x4, unop(Iop_Dup32x4, mkU32(0x1)),
+ mkU8(14)) );
+
+ assign( zLo, binop(Iop_Add32x4, mkexpr(cLo),
+ binop(Iop_SarN32x4,
+ binop(Iop_Add32x4, mkexpr(zKonst),
+ binop(Iop_MullEven16Sx8,
+ mkexpr(aLo), mkexpr(bLo))),
+ mkU8(15))) );
+
+ assign( zHi, binop(Iop_Add32x4, mkexpr(cHi),
+ binop(Iop_SarN32x4,
+ binop(Iop_Add32x4, mkexpr(zKonst),
+ binop(Iop_MullEven16Sx8,
+ mkexpr(aHi), mkexpr(bHi))),
+ mkU8(15))) );
+
+ putVReg( vD_addr, binop(Iop_QNarrow32Sx4, mkexpr(zHi), mkexpr(zLo)) );
+ break;
+ }
+ case 0x22: { // vmladduhm (Mult Low, Add Unsigned HW Modulo, AV p194)
+ DIP("vmladduhm v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+ assign(aLo, binop(Iop_InterleaveLO16x8, mkexpr(zeros), mkexpr(vA)));
+ assign(bLo, binop(Iop_InterleaveLO16x8, mkexpr(zeros), mkexpr(vB)));
+ assign(cLo, binop(Iop_InterleaveLO16x8, mkexpr(zeros), mkexpr(vC)));
+ assign(aHi, binop(Iop_InterleaveHI16x8, mkexpr(zeros), mkexpr(vA)));
+ assign(bHi, binop(Iop_InterleaveHI16x8, mkexpr(zeros), mkexpr(vB)));
+ assign(cHi, binop(Iop_InterleaveHI16x8, mkexpr(zeros), mkexpr(vC)));
+ assign(zLo, binop(Iop_Add32x4,
+ binop(Iop_MullEven16Ux8, mkexpr(aLo), mkexpr(bLo)),
+ mkexpr(cLo)) );
+ assign(zHi, binop(Iop_Add32x4,
+ binop(Iop_MullEven16Ux8, mkexpr(aHi), mkexpr(bHi)),
+ mkexpr(cHi)));
+ putVReg(vD_addr, binop(Iop_Narrow32x4, mkexpr(zHi), mkexpr(zLo)));
+ break;
+ }
+
+
+ /* Multiply-Sum */
+ case 0x24: { // vmsumubm (Multiply Sum Unsigned B Modulo, AV p204)
+ IRTemp abEE, abEO, abOE, abOO;
+ abEE = abEO = abOE = abOO = IRTemp_INVALID;
+ DIP("vmsumubm v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+
+ /* multiply vA,vB (unsigned, widening) */
+ assign( abEvn, MK_Iop_MullOdd8Ux16( mkexpr(vA), mkexpr(vB) ));
+ assign( abOdd, binop(Iop_MullEven8Ux16, mkexpr(vA), mkexpr(vB)) );
+
+ /* evn,odd: V128_16Ux8 -> 2 x V128_32Ux4, zero-extended */
+ expand16Ux8( mkexpr(abEvn), &abEE, &abEO );
+ expand16Ux8( mkexpr(abOdd), &abOE, &abOO );
+
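+      /* Per 32-bit lane, the sum below is
+         c + a0*b0 + a1*b1 + a2*b2 + a3*b3 over the four byte pairs
+         covered by that lane. */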
+ putVReg( vD_addr,
+ binop(Iop_Add32x4, mkexpr(vC),
+ binop(Iop_Add32x4,
+ binop(Iop_Add32x4, mkexpr(abEE), mkexpr(abEO)),
+ binop(Iop_Add32x4, mkexpr(abOE), mkexpr(abOO)))) );
+ break;
+ }
+ case 0x25: { // vmsummbm (Multiply Sum Mixed-Sign B Modulo, AV p201)
+ IRTemp aEvn, aOdd, bEvn, bOdd;
+ IRTemp abEE = newTemp(Ity_V128);
+ IRTemp abEO = newTemp(Ity_V128);
+ IRTemp abOE = newTemp(Ity_V128);
+ IRTemp abOO = newTemp(Ity_V128);
+ aEvn = aOdd = bEvn = bOdd = IRTemp_INVALID;
+ DIP("vmsummbm v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+
+ /* sign-extend vA, zero-extend vB, for mixed-sign multiply
+ (separating out adjacent lanes to different vectors) */
+ expand8Sx16( mkexpr(vA), &aEvn, &aOdd );
+ expand8Ux16( mkexpr(vB), &bEvn, &bOdd );
+
+ /* multiply vA, vB, again separating adjacent lanes */
+ assign( abEE, MK_Iop_MullOdd16Sx8( mkexpr(aEvn), mkexpr(bEvn) ));
+ assign( abEO, binop(Iop_MullEven16Sx8, mkexpr(aEvn), mkexpr(bEvn)) );
+ assign( abOE, MK_Iop_MullOdd16Sx8( mkexpr(aOdd), mkexpr(bOdd) ));
+ assign( abOO, binop(Iop_MullEven16Sx8, mkexpr(aOdd), mkexpr(bOdd)) );
+
+ /* add results together, + vC */
+ putVReg( vD_addr,
+ binop(Iop_QAdd32Sx4, mkexpr(vC),
+ binop(Iop_QAdd32Sx4,
+ binop(Iop_QAdd32Sx4, mkexpr(abEE), mkexpr(abEO)),
+ binop(Iop_QAdd32Sx4, mkexpr(abOE), mkexpr(abOO)))) );
+ break;
+ }
+ case 0x26: { // vmsumuhm (Multiply Sum Unsigned HW Modulo, AV p205)
+ DIP("vmsumuhm v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+ assign( abEvn, MK_Iop_MullOdd16Ux8( mkexpr(vA), mkexpr(vB) ));
+ assign( abOdd, binop(Iop_MullEven16Ux8, mkexpr(vA), mkexpr(vB)) );
+ putVReg( vD_addr,
+ binop(Iop_Add32x4, mkexpr(vC),
+ binop(Iop_Add32x4, mkexpr(abEvn), mkexpr(abOdd))) );
+ break;
+ }
+ case 0x27: { // vmsumuhs (Multiply Sum Unsigned HW Saturate, AV p206)
+ DIP("vmsumuhs v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+ /* widening multiply, separating lanes */
+ assign( abEvn, MK_Iop_MullOdd16Ux8(mkexpr(vA), mkexpr(vB) ));
+ assign( abOdd, binop(Iop_MullEven16Ux8, mkexpr(vA), mkexpr(vB)) );
+
+ /* break V128 to 4xI32's, zero-extending to I64's */
+ breakV128to4x64U( mkexpr(abEvn), &ab7, &ab5, &ab3, &ab1 );
+ breakV128to4x64U( mkexpr(abOdd), &ab6, &ab4, &ab2, &ab0 );
+ breakV128to4x64U( mkexpr(vC), &c3, &c2, &c1, &c0 );
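+      /* 64-bit lanes are needed here: c + ab_even + ab_odd can
+         overflow 32 bits, and the saturating narrow below must see
+         the true (unclipped) sum. */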
+
+ /* add lanes */
+ assign( z3, binop(Iop_Add64, mkexpr(c3),
+ binop(Iop_Add64, mkexpr(ab7), mkexpr(ab6))));
+ assign( z2, binop(Iop_Add64, mkexpr(c2),
+ binop(Iop_Add64, mkexpr(ab5), mkexpr(ab4))));
+ assign( z1, binop(Iop_Add64, mkexpr(c1),
+ binop(Iop_Add64, mkexpr(ab3), mkexpr(ab2))));
+ assign( z0, binop(Iop_Add64, mkexpr(c0),
+ binop(Iop_Add64, mkexpr(ab1), mkexpr(ab0))));
+
+ /* saturate-narrow to 32bit, and combine to V128 */
+ putVReg( vD_addr, mkV128from4x64U( mkexpr(z3), mkexpr(z2),
+ mkexpr(z1), mkexpr(z0)) );
+
+ break;
+ }
+ case 0x28: { // vmsumshm (Multiply Sum Signed HW Modulo, AV p202)
+ DIP("vmsumshm v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+ assign( abEvn, MK_Iop_MullOdd16Sx8( mkexpr(vA), mkexpr(vB) ));
+ assign( abOdd, binop(Iop_MullEven16Sx8, mkexpr(vA), mkexpr(vB)) );
+ putVReg( vD_addr,
+ binop(Iop_Add32x4, mkexpr(vC),
+ binop(Iop_Add32x4, mkexpr(abOdd), mkexpr(abEvn))) );
+ break;
+ }
+ case 0x29: { // vmsumshs (Multiply Sum Signed HW Saturate, AV p203)
+ DIP("vmsumshs v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+ /* widening multiply, separating lanes */
+ assign( abEvn, MK_Iop_MullOdd16Sx8( mkexpr(vA), mkexpr(vB) ));
+ assign( abOdd, binop(Iop_MullEven16Sx8, mkexpr(vA), mkexpr(vB)) );
+
+ /* break V128 to 4xI32's, sign-extending to I64's */
+ breakV128to4x64S( mkexpr(abEvn), &ab7, &ab5, &ab3, &ab1 );
+ breakV128to4x64S( mkexpr(abOdd), &ab6, &ab4, &ab2, &ab0 );
+ breakV128to4x64S( mkexpr(vC), &c3, &c2, &c1, &c0 );
+
+ /* add lanes */
+ assign( z3, binop(Iop_Add64, mkexpr(c3),
+ binop(Iop_Add64, mkexpr(ab7), mkexpr(ab6))));
+ assign( z2, binop(Iop_Add64, mkexpr(c2),
+ binop(Iop_Add64, mkexpr(ab5), mkexpr(ab4))));
+ assign( z1, binop(Iop_Add64, mkexpr(c1),
+ binop(Iop_Add64, mkexpr(ab3), mkexpr(ab2))));
+ assign( z0, binop(Iop_Add64, mkexpr(c0),
+ binop(Iop_Add64, mkexpr(ab1), mkexpr(ab0))));
+
+ /* saturate-narrow to 32bit, and combine to V128 */
+ putVReg( vD_addr, mkV128from4x64S( mkexpr(z3), mkexpr(z2),
+ mkexpr(z1), mkexpr(z0)) );
+ break;
+ }
+ default:
+ vex_printf("dis_av_multarith(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+/*
+ AltiVec Shift/Rotate Instructions
+*/
+static Bool dis_av_shift ( UInt theInstr )
+{
+ /* VX-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UInt opc2 = IFIELD( theInstr, 0, 11 );
+
+ IRTemp vA = newTemp(Ity_V128);
+ IRTemp vB = newTemp(Ity_V128);
+ assign( vA, getVReg(vA_addr));
+ assign( vB, getVReg(vB_addr));
+
+   if (opc1 != 0x4) {
+ vex_printf("dis_av_shift(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ /* Rotate */
+ case 0x004: // vrlb (Rotate Left Integer B, AV p234)
+ DIP("vrlb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Rol8x16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x044: // vrlh (Rotate Left Integer HW, AV p235)
+ DIP("vrlh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Rol16x8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x084: // vrlw (Rotate Left Integer W, AV p236)
+ DIP("vrlw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Rol32x4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+
+ /* Shift Left */
+ case 0x104: // vslb (Shift Left Integer B, AV p240)
+ DIP("vslb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Shl8x16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x144: // vslh (Shift Left Integer HW, AV p242)
+ DIP("vslh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Shl16x8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x184: // vslw (Shift Left Integer W, AV p244)
+ DIP("vslw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Shl32x4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x1C4: { // vsl (Shift Left, AV p239)
+ IRTemp sh = newTemp(Ity_I8);
+ DIP("vsl v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ assign( sh, binop(Iop_And8, mkU8(0x7),
+ unop(Iop_32to8,
+ unop(Iop_V128to32, mkexpr(vB)))) );
+ putVReg( vD_addr,
+ binop(Iop_ShlV128, mkexpr(vA), mkexpr(sh)) );
+ break;
+ }
+ case 0x40C: { // vslo (Shift Left by Octet, AV p243)
+ IRTemp sh = newTemp(Ity_I8);
+ DIP("vslo v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ assign( sh, binop(Iop_And8, mkU8(0x78),
+ unop(Iop_32to8,
+ unop(Iop_V128to32, mkexpr(vB)))) );
+ putVReg( vD_addr,
+ binop(Iop_ShlV128, mkexpr(vA), mkexpr(sh)) );
+ break;
+ }
+
+
+ /* Shift Right */
+ case 0x204: // vsrb (Shift Right B, AV p256)
+ DIP("vsrb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Shr8x16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x244: // vsrh (Shift Right HW, AV p257)
+ DIP("vsrh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Shr16x8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x284: // vsrw (Shift Right W, AV p259)
+ DIP("vsrw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Shr32x4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x2C4: { // vsr (Shift Right, AV p251)
+ IRTemp sh = newTemp(Ity_I8);
+ DIP("vsr v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ assign( sh, binop(Iop_And8, mkU8(0x7),
+ unop(Iop_32to8,
+ unop(Iop_V128to32, mkexpr(vB)))) );
+ putVReg( vD_addr,
+ binop(Iop_ShrV128, mkexpr(vA), mkexpr(sh)) );
+ break;
+ }
+ case 0x304: // vsrab (Shift Right Alg B, AV p253)
+ DIP("vsrab v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Sar8x16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x344: // vsrah (Shift Right Alg HW, AV p254)
+ DIP("vsrah v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Sar16x8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x384: // vsraw (Shift Right Alg W, AV p255)
+ DIP("vsraw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Sar32x4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x44C: { // vsro (Shift Right by Octet, AV p258)
+ IRTemp sh = newTemp(Ity_I8);
+ DIP("vsro v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ assign( sh, binop(Iop_And8, mkU8(0x78),
+ unop(Iop_32to8,
+ unop(Iop_V128to32, mkexpr(vB)))) );
+ putVReg( vD_addr,
+ binop(Iop_ShrV128, mkexpr(vA), mkexpr(sh)) );
+ break;
+ }
+
+ default:
+ vex_printf("dis_av_shift(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+/*
+ AltiVec Permute Instructions
+*/
+static Bool dis_av_permute ( UInt theInstr )
+{
+ /* VA-Form, VX-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar UIMM_5 = vA_addr;
+ UChar vB_addr = ifieldRegB(theInstr);
+ UChar vC_addr = ifieldRegC(theInstr);
+ UChar b10 = ifieldBIT10(theInstr);
+ UChar SHB_uimm4 = toUChar( IFIELD( theInstr, 6, 4 ) );
+ UInt opc2 = toUChar( IFIELD( theInstr, 0, 6 ) );
+
+ UChar SIMM_8 = extend_s_5to8(UIMM_5);
+
+ IRTemp vA = newTemp(Ity_V128);
+ IRTemp vB = newTemp(Ity_V128);
+ IRTemp vC = newTemp(Ity_V128);
+ assign( vA, getVReg(vA_addr));
+ assign( vB, getVReg(vB_addr));
+ assign( vC, getVReg(vC_addr));
+
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_permute(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x2A: // vsel (Conditional Select, AV p238)
+ DIP("vsel v%d,v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr, vC_addr);
+ /* vD = (vA & ~vC) | (vB & vC) */
+ putVReg( vD_addr, binop(Iop_OrV128,
+ binop(Iop_AndV128, mkexpr(vA), unop(Iop_NotV128, mkexpr(vC))),
+ binop(Iop_AndV128, mkexpr(vB), mkexpr(vC))) );
+ return True;
+
+ case 0x2B: { // vperm (Permute, AV p218)
+      /* IR ops are limited to two args, so we have to play games... */
+ IRTemp a_perm = newTemp(Ity_V128);
+ IRTemp b_perm = newTemp(Ity_V128);
+ IRTemp mask = newTemp(Ity_V128);
+ IRTemp vC_andF = newTemp(Ity_V128);
+ DIP("vperm v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vB_addr, vC_addr);
+ /* Limit the Perm8x16 steering values to 0 .. 15 as that is what
+ IR specifies, and also to hide irrelevant bits from
+ memcheck */
+ assign( vC_andF,
+ binop(Iop_AndV128, mkexpr(vC),
+ unop(Iop_Dup8x16, mkU8(0xF))) );
+ assign( a_perm,
+ binop(Iop_Perm8x16, mkexpr(vA), mkexpr(vC_andF)) );
+ assign( b_perm,
+ binop(Iop_Perm8x16, mkexpr(vB), mkexpr(vC_andF)) );
+ // mask[i8] = (vC[i8]_4 == 1) ? 0xFF : 0x0
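+      // (the Shl by 3 moves bit 4 of each byte into the byte's sign
+      //  position; the arithmetic Sar by 7 then replicates it across
+      //  all 8 bits, giving the 0xFF/0x00 select mask)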
+ assign( mask, binop(Iop_SarN8x16,
+ binop(Iop_ShlN8x16, mkexpr(vC), mkU8(3)),
+ mkU8(7)) );
+ // dst = (a & ~mask) | (b & mask)
+ putVReg( vD_addr, binop(Iop_OrV128,
+ binop(Iop_AndV128, mkexpr(a_perm),
+ unop(Iop_NotV128, mkexpr(mask))),
+ binop(Iop_AndV128, mkexpr(b_perm),
+ mkexpr(mask))) );
+ return True;
+ }
+ case 0x2C: // vsldoi (Shift Left Double by Octet Imm, AV p241)
+ if (b10 != 0) {
+ vex_printf("dis_av_permute(ppc)(vsldoi)\n");
+ return False;
+ }
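+      /* SHB == 0 must be special-cased: the combined form below would
+         otherwise require ShrV128 by (16-0)*8 == 128 bits, which is
+         not a valid shift amount for a 128-bit value. */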
+ DIP("vsldoi v%d,v%d,v%d,%d\n",
+ vD_addr, vA_addr, vB_addr, SHB_uimm4);
+ if (SHB_uimm4 == 0)
+ putVReg( vD_addr, mkexpr(vA) );
+ else
+ putVReg( vD_addr,
+ binop(Iop_OrV128,
+ binop(Iop_ShlV128, mkexpr(vA), mkU8(SHB_uimm4*8)),
+ binop(Iop_ShrV128, mkexpr(vB), mkU8((16-SHB_uimm4)*8))) );
+ return True;
+
+ default:
+ break; // Fall through...
+ }
+
+ opc2 = IFIELD( theInstr, 0, 11 );
+ switch (opc2) {
+
+ /* Merge */
+ case 0x00C: // vmrghb (Merge High B, AV p195)
+ DIP("vmrghb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_InterleaveHI8x16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x04C: // vmrghh (Merge High HW, AV p196)
+ DIP("vmrghh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_InterleaveHI16x8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x08C: // vmrghw (Merge High W, AV p197)
+ DIP("vmrghw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_InterleaveHI32x4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x10C: // vmrglb (Merge Low B, AV p198)
+ DIP("vmrglb v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_InterleaveLO8x16, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x14C: // vmrglh (Merge Low HW, AV p199)
+ DIP("vmrglh v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_InterleaveLO16x8, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x18C: // vmrglw (Merge Low W, AV p200)
+ DIP("vmrglw v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_InterleaveLO32x4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+
+ /* Splat */
+ case 0x20C: { // vspltb (Splat Byte, AV p245)
+ /* vD = Dup8x16( vB[UIMM_5] ) */
+ UChar sh_uimm = (15 - (UIMM_5 & 15)) * 8;
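+      /* AltiVec numbers elements from the most significant end, so
+         e.g. UIMM_5 == 0 gives sh_uimm == 120, bringing the top byte
+         down to the low byte extracted below. */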
+ DIP("vspltb v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5);
+ putVReg( vD_addr, unop(Iop_Dup8x16,
+ unop(Iop_32to8, unop(Iop_V128to32,
+ binop(Iop_ShrV128, mkexpr(vB), mkU8(sh_uimm))))) );
+ break;
+ }
+ case 0x24C: { // vsplth (Splat Half Word, AV p246)
+ UChar sh_uimm = (7 - (UIMM_5 & 7)) * 16;
+ DIP("vsplth v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5);
+ putVReg( vD_addr, unop(Iop_Dup16x8,
+ unop(Iop_32to16, unop(Iop_V128to32,
+ binop(Iop_ShrV128, mkexpr(vB), mkU8(sh_uimm))))) );
+ break;
+ }
+ case 0x28C: { // vspltw (Splat Word, AV p250)
+ /* vD = Dup32x4( vB[UIMM_5] ) */
+ UChar sh_uimm = (3 - (UIMM_5 & 3)) * 32;
+ DIP("vspltw v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5);
+ putVReg( vD_addr, unop(Iop_Dup32x4,
+ unop(Iop_V128to32,
+ binop(Iop_ShrV128, mkexpr(vB), mkU8(sh_uimm)))) );
+ break;
+ }
+ case 0x30C: // vspltisb (Splat Immediate Signed B, AV p247)
+ DIP("vspltisb v%d,%d\n", vD_addr, (Char)SIMM_8);
+ putVReg( vD_addr, unop(Iop_Dup8x16, mkU8(SIMM_8)) );
+ break;
+
+ case 0x34C: // vspltish (Splat Immediate Signed HW, AV p248)
+ DIP("vspltish v%d,%d\n", vD_addr, (Char)SIMM_8);
+ putVReg( vD_addr,
+ unop(Iop_Dup16x8, mkU16(extend_s_8to32(SIMM_8))) );
+ break;
+
+ case 0x38C: // vspltisw (Splat Immediate Signed W, AV p249)
+ DIP("vspltisw v%d,%d\n", vD_addr, (Char)SIMM_8);
+ putVReg( vD_addr,
+ unop(Iop_Dup32x4, mkU32(extend_s_8to32(SIMM_8))) );
+ break;
+
+ default:
+ vex_printf("dis_av_permute(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+/*
+ AltiVec Pack/Unpack Instructions
+*/
+static Bool dis_av_pack ( UInt theInstr )
+{
+ /* VX-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UInt opc2 = IFIELD( theInstr, 0, 11 );
+
+ IRTemp signs = IRTemp_INVALID;
+ IRTemp zeros = IRTemp_INVALID;
+ IRTemp vA = newTemp(Ity_V128);
+ IRTemp vB = newTemp(Ity_V128);
+ assign( vA, getVReg(vA_addr));
+ assign( vB, getVReg(vB_addr));
+
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_pack(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ /* Packing */
+ case 0x00E: // vpkuhum (Pack Unsigned HW Unsigned Modulo, AV p224)
+ DIP("vpkuhum v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Narrow16x8, mkexpr(vA), mkexpr(vB)) );
+ return True;
+
+ case 0x04E: // vpkuwum (Pack Unsigned W Unsigned Modulo, AV p226)
+ DIP("vpkuwum v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Narrow32x4, mkexpr(vA), mkexpr(vB)) );
+ return True;
+
+ case 0x08E: // vpkuhus (Pack Unsigned HW Unsigned Saturate, AV p225)
+ DIP("vpkuhus v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_QNarrow16Ux8, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ return True;
+
+ case 0x0CE: // vpkuwus (Pack Unsigned W Unsigned Saturate, AV p227)
+ DIP("vpkuwus v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_QNarrow32Ux4, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ return True;
+
+ case 0x10E: { // vpkshus (Pack Signed HW Unsigned Saturate, AV p221)
+ // This insn does a signed->unsigned saturating conversion.
+ // Conversion done here, then uses unsigned->unsigned vpk insn:
+ // => UnsignedSaturatingNarrow( x & ~ (x >>s 15) )
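+      // E.g. for negative x, (x >>s 15) == 0xFFFF and x & ~0xFFFF == 0,
+      // clamping to zero; for non-negative x the mask is all ones and
+      // x passes through unchanged.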
+ IRTemp vA_tmp = newTemp(Ity_V128);
+ IRTemp vB_tmp = newTemp(Ity_V128);
+ DIP("vpkshus v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ assign( vA_tmp, binop(Iop_AndV128, mkexpr(vA),
+ unop(Iop_NotV128,
+ binop(Iop_SarN16x8,
+ mkexpr(vA), mkU8(15)))) );
+ assign( vB_tmp, binop(Iop_AndV128, mkexpr(vB),
+ unop(Iop_NotV128,
+ binop(Iop_SarN16x8,
+ mkexpr(vB), mkU8(15)))) );
+ putVReg( vD_addr, binop(Iop_QNarrow16Ux8,
+ mkexpr(vA_tmp), mkexpr(vB_tmp)) );
+ // TODO: set VSCR[SAT]
+ return True;
+ }
+ case 0x14E: { // vpkswus (Pack Signed W Unsigned Saturate, AV p223)
+ // This insn does a signed->unsigned saturating conversion.
+ // Conversion done here, then uses unsigned->unsigned vpk insn:
+ // => UnsignedSaturatingNarrow( x & ~ (x >>s 31) )
+ IRTemp vA_tmp = newTemp(Ity_V128);
+ IRTemp vB_tmp = newTemp(Ity_V128);
+ DIP("vpkswus v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ assign( vA_tmp, binop(Iop_AndV128, mkexpr(vA),
+ unop(Iop_NotV128,
+ binop(Iop_SarN32x4,
+ mkexpr(vA), mkU8(31)))) );
+ assign( vB_tmp, binop(Iop_AndV128, mkexpr(vB),
+ unop(Iop_NotV128,
+ binop(Iop_SarN32x4,
+ mkexpr(vB), mkU8(31)))) );
+ putVReg( vD_addr, binop(Iop_QNarrow32Ux4,
+ mkexpr(vA_tmp), mkexpr(vB_tmp)) );
+ // TODO: set VSCR[SAT]
+ return True;
+ }
+ case 0x18E: // vpkshss (Pack Signed HW Signed Saturate, AV p220)
+ DIP("vpkshss v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_QNarrow16Sx8, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ return True;
+
+ case 0x1CE: // vpkswss (Pack Signed W Signed Saturate, AV p222)
+ DIP("vpkswss v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_QNarrow32Sx4, mkexpr(vA), mkexpr(vB)) );
+ // TODO: set VSCR[SAT]
+ return True;
+
+ case 0x30E: { // vpkpx (Pack Pixel, AV p219)
+ /* CAB: Worth a new primop? */
+ /* Using shifts to compact pixel elements, then packing them */
+ IRTemp a1 = newTemp(Ity_V128);
+ IRTemp a2 = newTemp(Ity_V128);
+ IRTemp a3 = newTemp(Ity_V128);
+ IRTemp a_tmp = newTemp(Ity_V128);
+ IRTemp b1 = newTemp(Ity_V128);
+ IRTemp b2 = newTemp(Ity_V128);
+ IRTemp b3 = newTemp(Ity_V128);
+ IRTemp b_tmp = newTemp(Ity_V128);
+ DIP("vpkpx v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ assign( a1, binop(Iop_ShlN16x8,
+ binop(Iop_ShrN32x4, mkexpr(vA), mkU8(19)),
+ mkU8(10)) );
+ assign( a2, binop(Iop_ShlN16x8,
+ binop(Iop_ShrN16x8, mkexpr(vA), mkU8(11)),
+ mkU8(5)) );
+ assign( a3, binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8, mkexpr(vA), mkU8(8)),
+ mkU8(11)) );
+ assign( a_tmp, binop(Iop_OrV128, mkexpr(a1),
+ binop(Iop_OrV128, mkexpr(a2), mkexpr(a3))) );
+
+ assign( b1, binop(Iop_ShlN16x8,
+ binop(Iop_ShrN32x4, mkexpr(vB), mkU8(19)),
+ mkU8(10)) );
+ assign( b2, binop(Iop_ShlN16x8,
+ binop(Iop_ShrN16x8, mkexpr(vB), mkU8(11)),
+ mkU8(5)) );
+ assign( b3, binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8, mkexpr(vB), mkU8(8)),
+ mkU8(11)) );
+ assign( b_tmp, binop(Iop_OrV128, mkexpr(b1),
+ binop(Iop_OrV128, mkexpr(b2), mkexpr(b3))) );
+
+ putVReg( vD_addr, binop(Iop_Narrow32x4,
+ mkexpr(a_tmp), mkexpr(b_tmp)) );
+ return True;
+ }
+
+ default:
+ break; // Fall through...
+ }
+
+
+ if (vA_addr != 0) {
+ vex_printf("dis_av_pack(ppc)(vA_addr)\n");
+ return False;
+ }
+
+ signs = newTemp(Ity_V128);
+ zeros = newTemp(Ity_V128);
+ assign( zeros, unop(Iop_Dup32x4, mkU32(0)) );
+
+ switch (opc2) {
+ /* Unpacking */
+ case 0x20E: { // vupkhsb (Unpack High Signed B, AV p277)
+ DIP("vupkhsb v%d,v%d\n", vD_addr, vB_addr);
+ assign( signs, binop(Iop_CmpGT8Sx16, mkexpr(zeros), mkexpr(vB)) );
+ putVReg( vD_addr,
+ binop(Iop_InterleaveHI8x16, mkexpr(signs), mkexpr(vB)) );
+ break;
+ }
+ case 0x24E: { // vupkhsh (Unpack High Signed HW, AV p278)
+ DIP("vupkhsh v%d,v%d\n", vD_addr, vB_addr);
+ assign( signs, binop(Iop_CmpGT16Sx8, mkexpr(zeros), mkexpr(vB)) );
+ putVReg( vD_addr,
+ binop(Iop_InterleaveHI16x8, mkexpr(signs), mkexpr(vB)) );
+ break;
+ }
+ case 0x28E: { // vupklsb (Unpack Low Signed B, AV p280)
+ DIP("vupklsb v%d,v%d\n", vD_addr, vB_addr);
+ assign( signs, binop(Iop_CmpGT8Sx16, mkexpr(zeros), mkexpr(vB)) );
+ putVReg( vD_addr,
+ binop(Iop_InterleaveLO8x16, mkexpr(signs), mkexpr(vB)) );
+ break;
+ }
+ case 0x2CE: { // vupklsh (Unpack Low Signed HW, AV p281)
+ DIP("vupklsh v%d,v%d\n", vD_addr, vB_addr);
+ assign( signs, binop(Iop_CmpGT16Sx8, mkexpr(zeros), mkexpr(vB)) );
+ putVReg( vD_addr,
+ binop(Iop_InterleaveLO16x8, mkexpr(signs), mkexpr(vB)) );
+ break;
+ }
+ case 0x34E: { // vupkhpx (Unpack High Pixel16, AV p276)
+ /* CAB: Worth a new primop? */
+ /* Using shifts to isolate pixel elements, then expanding them */
+ IRTemp z0 = newTemp(Ity_V128);
+ IRTemp z1 = newTemp(Ity_V128);
+ IRTemp z01 = newTemp(Ity_V128);
+ IRTemp z2 = newTemp(Ity_V128);
+ IRTemp z3 = newTemp(Ity_V128);
+ IRTemp z23 = newTemp(Ity_V128);
+ DIP("vupkhpx v%d,v%d\n", vD_addr, vB_addr);
+ assign( z0, binop(Iop_ShlN16x8,
+ binop(Iop_SarN16x8, mkexpr(vB), mkU8(15)),
+ mkU8(8)) );
+ assign( z1, binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8, mkexpr(vB), mkU8(1)),
+ mkU8(11)) );
+ assign( z01, binop(Iop_InterleaveHI16x8, mkexpr(zeros),
+ binop(Iop_OrV128, mkexpr(z0), mkexpr(z1))) );
+ assign( z2, binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8,
+ binop(Iop_ShrN16x8, mkexpr(vB), mkU8(5)),
+ mkU8(11)),
+ mkU8(3)) );
+ assign( z3, binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8, mkexpr(vB), mkU8(11)),
+ mkU8(11)) );
+ assign( z23, binop(Iop_InterleaveHI16x8, mkexpr(zeros),
+ binop(Iop_OrV128, mkexpr(z2), mkexpr(z3))) );
+ putVReg( vD_addr,
+ binop(Iop_OrV128,
+ binop(Iop_ShlN32x4, mkexpr(z01), mkU8(16)),
+ mkexpr(z23)) );
+ break;
+ }
+ case 0x3CE: { // vupklpx (Unpack Low Pixel16, AV p279)
+ /* identical to vupkhpx, except interleaving LO */
+ IRTemp z0 = newTemp(Ity_V128);
+ IRTemp z1 = newTemp(Ity_V128);
+ IRTemp z01 = newTemp(Ity_V128);
+ IRTemp z2 = newTemp(Ity_V128);
+ IRTemp z3 = newTemp(Ity_V128);
+ IRTemp z23 = newTemp(Ity_V128);
+ DIP("vupklpx v%d,v%d\n", vD_addr, vB_addr);
+ assign( z0, binop(Iop_ShlN16x8,
+ binop(Iop_SarN16x8, mkexpr(vB), mkU8(15)),
+ mkU8(8)) );
+ assign( z1, binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8, mkexpr(vB), mkU8(1)),
+ mkU8(11)) );
+ assign( z01, binop(Iop_InterleaveLO16x8, mkexpr(zeros),
+ binop(Iop_OrV128, mkexpr(z0), mkexpr(z1))) );
+ assign( z2, binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8,
+ binop(Iop_ShrN16x8, mkexpr(vB), mkU8(5)),
+ mkU8(11)),
+ mkU8(3)) );
+ assign( z3, binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8, mkexpr(vB), mkU8(11)),
+ mkU8(11)) );
+ assign( z23, binop(Iop_InterleaveLO16x8, mkexpr(zeros),
+ binop(Iop_OrV128, mkexpr(z2), mkexpr(z3))) );
+ putVReg( vD_addr,
+ binop(Iop_OrV128,
+ binop(Iop_ShlN32x4, mkexpr(z01), mkU8(16)),
+ mkexpr(z23)) );
+ break;
+ }
+ default:
+ vex_printf("dis_av_pack(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+
+/*
+ AltiVec Floating Point Arithmetic Instructions
+*/
+static Bool dis_av_fp_arith ( UInt theInstr )
+{
+   /* VA-Form, VX-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UChar vC_addr = ifieldRegC(theInstr);
+ UInt opc2=0;
+
+ IRTemp vA = newTemp(Ity_V128);
+ IRTemp vB = newTemp(Ity_V128);
+ IRTemp vC = newTemp(Ity_V128);
+ assign( vA, getVReg(vA_addr));
+ assign( vB, getVReg(vB_addr));
+ assign( vC, getVReg(vC_addr));
+
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_fp_arith(ppc)(instr)\n");
+ return False;
+ }
+
+ opc2 = IFIELD( theInstr, 0, 6 );
+ switch (opc2) {
+ case 0x2E: // vmaddfp (Multiply Add FP, AV p177)
+ DIP("vmaddfp v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vC_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_Add32Fx4, mkexpr(vB),
+ binop(Iop_Mul32Fx4, mkexpr(vA), mkexpr(vC))) );
+ return True;
+
+ case 0x2F: { // vnmsubfp (Negative Multiply-Subtract FP, AV p215)
+ DIP("vnmsubfp v%d,v%d,v%d,v%d\n",
+ vD_addr, vA_addr, vC_addr, vB_addr);
+ putVReg( vD_addr,
+ binop(Iop_Sub32Fx4,
+ mkexpr(vB),
+ binop(Iop_Mul32Fx4, mkexpr(vA), mkexpr(vC))) );
+ return True;
+ }
+
+ default:
+ break; // Fall through...
+ }
+
+ opc2 = IFIELD( theInstr, 0, 11 );
+ switch (opc2) {
+ case 0x00A: // vaddfp (Add FP, AV p137)
+ DIP("vaddfp v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Add32Fx4, mkexpr(vA), mkexpr(vB)) );
+ return True;
+
+ case 0x04A: // vsubfp (Subtract FP, AV p261)
+ DIP("vsubfp v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Sub32Fx4, mkexpr(vA), mkexpr(vB)) );
+ return True;
+
+ case 0x40A: // vmaxfp (Maximum FP, AV p178)
+ DIP("vmaxfp v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Max32Fx4, mkexpr(vA), mkexpr(vB)) );
+ return True;
+
+ case 0x44A: // vminfp (Minimum FP, AV p187)
+ DIP("vminfp v%d,v%d,v%d\n", vD_addr, vA_addr, vB_addr);
+ putVReg( vD_addr, binop(Iop_Min32Fx4, mkexpr(vA), mkexpr(vB)) );
+ return True;
+
+ default:
+ break; // Fall through...
+ }
+
+
+ if (vA_addr != 0) {
+ vex_printf("dis_av_fp_arith(ppc)(vA_addr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+      case 0x10A: // vrefp (Reciprocal Estimate FP, AV p228)
+ DIP("vrefp v%d,v%d\n", vD_addr, vB_addr);
+ putVReg( vD_addr, unop(Iop_Recip32Fx4, mkexpr(vB)) );
+ return True;
+
+ case 0x14A: // vrsqrtefp (Reciprocal Sqrt Estimate FP, AV p237)
+ DIP("vrsqrtefp v%d,v%d\n", vD_addr, vB_addr);
+ putVReg( vD_addr, unop(Iop_RSqrt32Fx4, mkexpr(vB)) );
+ return True;
+
+ case 0x18A: // vexptefp (2 Raised to the Exp Est FP, AV p173)
+ DIP("vexptefp v%d,v%d\n", vD_addr, vB_addr);
+ DIP(" => not implemented\n");
+ return False;
+
+ case 0x1CA: // vlogefp (Log2 Estimate FP, AV p175)
+ DIP("vlogefp v%d,v%d\n", vD_addr, vB_addr);
+ DIP(" => not implemented\n");
+ return False;
+
+ default:
+ vex_printf("dis_av_fp_arith(ppc)(opc2=0x%x)\n",opc2);
+ return False;
+ }
+ return True;
+}
+
+/*
+ AltiVec Floating Point Compare Instructions
+*/
+static Bool dis_av_fp_cmp ( UInt theInstr )
+{
+ /* VXR-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar vA_addr = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UChar flag_rC = ifieldBIT10(theInstr);
+ UInt opc2 = IFIELD( theInstr, 0, 10 );
+
+ Bool cmp_bounds = False;
+
+ IRTemp vA = newTemp(Ity_V128);
+ IRTemp vB = newTemp(Ity_V128);
+ IRTemp vD = newTemp(Ity_V128);
+ assign( vA, getVReg(vA_addr));
+ assign( vB, getVReg(vB_addr));
+
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_fp_cmp(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x0C6: // vcmpeqfp (Compare Equal-to FP, AV p159)
+ DIP("vcmpeqfp%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpEQ32Fx4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x1C6: // vcmpgefp (Compare Greater-than-or-Equal-to, AV p163)
+ DIP("vcmpgefp%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpGE32Fx4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x2C6: // vcmpgtfp (Compare Greater-than FP, AV p164)
+ DIP("vcmpgtfp%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ assign( vD, binop(Iop_CmpGT32Fx4, mkexpr(vA), mkexpr(vB)) );
+ break;
+
+ case 0x3C6: { // vcmpbfp (Compare Bounds FP, AV p157)
+ IRTemp gt = newTemp(Ity_V128);
+ IRTemp lt = newTemp(Ity_V128);
+ IRTemp zeros = newTemp(Ity_V128);
+ DIP("vcmpbfp%s v%d,v%d,v%d\n", (flag_rC ? ".":""),
+ vD_addr, vA_addr, vB_addr);
+ cmp_bounds = True;
+ assign( zeros, unop(Iop_Dup32x4, mkU32(0)) );
+
+      /* Note: this makes use of the fact that the ppc backend for
+         compare insns returns zeroed lanes if either of the
+         corresponding arg lanes is a NaN.
+
+         Perhaps better to have an irop Iop_isNan32Fx4, but then we'd
+         need this for the other compares too (vcmpeqfp etc)...
+         Better still, tighten down the spec for compare irops.
+      */
+ assign( gt, unop(Iop_NotV128,
+ binop(Iop_CmpLE32Fx4, mkexpr(vA), mkexpr(vB))) );
+ assign( lt, unop(Iop_NotV128,
+ binop(Iop_CmpGE32Fx4, mkexpr(vA),
+ binop(Iop_Sub32Fx4, mkexpr(zeros),
+ mkexpr(vB)))) );
+
+ // finally, just shift gt,lt to correct position
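+      // (weights 2 and 1, shifted left 30: per lane the result is
+      //  (gt ? 0x80000000 : 0) | (lt ? 0x40000000 : 0); both bits
+      //  clear means vA lies within [-vB, vB])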
+ assign( vD, binop(Iop_ShlN32x4,
+ binop(Iop_OrV128,
+ binop(Iop_AndV128, mkexpr(gt),
+ unop(Iop_Dup32x4, mkU32(0x2))),
+ binop(Iop_AndV128, mkexpr(lt),
+ unop(Iop_Dup32x4, mkU32(0x1)))),
+ mkU8(30)) );
+ break;
+ }
+
+ default:
+ vex_printf("dis_av_fp_cmp(ppc)(opc2)\n");
+ return False;
+ }
+
+ putVReg( vD_addr, mkexpr(vD) );
+
+ if (flag_rC) {
+ set_AV_CR6( mkexpr(vD), !cmp_bounds );
+ }
+ return True;
+}
+
+/*
+ AltiVec Floating Point Convert/Round Instructions
+*/
+static Bool dis_av_fp_convert ( UInt theInstr )
+{
+ /* VX-Form */
+ UChar opc1 = ifieldOPC(theInstr);
+ UChar vD_addr = ifieldRegDS(theInstr);
+ UChar UIMM_5 = ifieldRegA(theInstr);
+ UChar vB_addr = ifieldRegB(theInstr);
+ UInt opc2 = IFIELD( theInstr, 0, 11 );
+
+ IRTemp vB = newTemp(Ity_V128);
+ IRTemp vScale = newTemp(Ity_V128);
+ IRTemp vInvScale = newTemp(Ity_V128);
+
+ float scale, inv_scale;
+
+ assign( vB, getVReg(vB_addr));
+
+ /* scale = 2^UIMM, cast to float, reinterpreted as uint */
+ scale = (float)( (unsigned int) 1<<UIMM_5 );
+ assign( vScale, unop(Iop_Dup32x4, mkU32( float_to_bits(scale) )) );
+ inv_scale = 1/scale;
+ assign( vInvScale,
+ unop(Iop_Dup32x4, mkU32( float_to_bits(inv_scale) )) );
+
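+   /* E.g. UIMM_5 == 3 gives scale == 8.0: vcfux/vcfsx multiply the
+      int->float result by 1/8, while vctuxs/vctsxs multiply by 8
+      before the saturating float->int conversion. */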
+ if (opc1 != 0x4) {
+ vex_printf("dis_av_fp_convert(ppc)(instr)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x30A: // vcfux (Convert from Unsigned Fixed-Point W, AV p156)
+ DIP("vcfux v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5);
+ putVReg( vD_addr, binop(Iop_Mul32Fx4,
+ unop(Iop_I32UtoFx4, mkexpr(vB)),
+ mkexpr(vInvScale)) );
+ return True;
+
+ case 0x34A: // vcfsx (Convert from Signed Fixed-Point W, AV p155)
+ DIP("vcfsx v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5);
+
+ putVReg( vD_addr, binop(Iop_Mul32Fx4,
+ unop(Iop_I32StoFx4, mkexpr(vB)),
+ mkexpr(vInvScale)) );
+ return True;
+
+ case 0x38A: // vctuxs (Convert to Unsigned Fixed-Point W Saturate, AV p172)
+ DIP("vctuxs v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5);
+ putVReg( vD_addr,
+ unop(Iop_QFtoI32Ux4_RZ,
+ binop(Iop_Mul32Fx4, mkexpr(vB), mkexpr(vScale))) );
+ return True;
+
+ case 0x3CA: // vctsxs (Convert to Signed Fixed-Point W Saturate, AV p171)
+ DIP("vctsxs v%d,v%d,%d\n", vD_addr, vB_addr, UIMM_5);
+ putVReg( vD_addr,
+ unop(Iop_QFtoI32Sx4_RZ,
+ binop(Iop_Mul32Fx4, mkexpr(vB), mkexpr(vScale))) );
+ return True;
+
+ default:
+ break; // Fall through...
+ }
+
+ if (UIMM_5 != 0) {
+ vex_printf("dis_av_fp_convert(ppc)(UIMM_5)\n");
+ return False;
+ }
+
+ switch (opc2) {
+ case 0x20A: // vrfin (Round to FP Integer Nearest, AV p231)
+ DIP("vrfin v%d,v%d\n", vD_addr, vB_addr);
+ putVReg( vD_addr, unop(Iop_RoundF32x4_RN, mkexpr(vB)) );
+ break;
+
+ case 0x24A: // vrfiz (Round to FP Integer toward zero, AV p233)
+ DIP("vrfiz v%d,v%d\n", vD_addr, vB_addr);
+ putVReg( vD_addr, unop(Iop_RoundF32x4_RZ, mkexpr(vB)) );
+ break;
+
+ case 0x28A: // vrfip (Round to FP Integer toward +inf, AV p232)
+ DIP("vrfip v%d,v%d\n", vD_addr, vB_addr);
+ putVReg( vD_addr, unop(Iop_RoundF32x4_RP, mkexpr(vB)) );
+ break;
+
+ case 0x2CA: // vrfim (Round to FP Integer toward -inf, AV p230)
+ DIP("vrfim v%d,v%d\n", vD_addr, vB_addr);
+ putVReg( vD_addr, unop(Iop_RoundF32x4_RM, mkexpr(vB)) );
+ break;
+
+ default:
+ vex_printf("dis_av_fp_convert(ppc)(opc2)\n");
+ return False;
+ }
+ return True;
+}
+
+
+
+
+
+/*------------------------------------------------------------*/
+/*--- Disassemble a single instruction ---*/
+/*------------------------------------------------------------*/
+
+/* Disassemble a single instruction into IR. The instruction
+ is located in host memory at &guest_code[delta]. */
+
+static
+DisResult disInstr_PPC_WRK (
+ Bool put_IP,
+ Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ Long delta64,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo
+ )
+{
+ UChar opc1;
+ UInt opc2;
+ DisResult dres;
+ UInt theInstr;
+ IRType ty = mode64 ? Ity_I64 : Ity_I32;
+ Bool allow_F = False;
+ Bool allow_V = False;
+ Bool allow_FX = False;
+ Bool allow_GX = False;
+ UInt hwcaps = archinfo->hwcaps;
+ Long delta;
+
+ /* What insn variants are we supporting today? */
+ if (mode64) {
+ allow_F = True;
+ allow_V = (0 != (hwcaps & VEX_HWCAPS_PPC64_V));
+ allow_FX = (0 != (hwcaps & VEX_HWCAPS_PPC64_FX));
+ allow_GX = (0 != (hwcaps & VEX_HWCAPS_PPC64_GX));
+ } else {
+ allow_F = (0 != (hwcaps & VEX_HWCAPS_PPC32_F));
+ allow_V = (0 != (hwcaps & VEX_HWCAPS_PPC32_V));
+ allow_FX = (0 != (hwcaps & VEX_HWCAPS_PPC32_FX));
+ allow_GX = (0 != (hwcaps & VEX_HWCAPS_PPC32_GX));
+ }
+
+ /* The running delta */
+ delta = (Long)mkSzAddr(ty, (ULong)delta64);
+
+ /* Set result defaults. */
+ dres.whatNext = Dis_Continue;
+ dres.len = 0;
+ dres.continueAt = 0;
+
+ /* At least this is simple on PPC32: insns are all 4 bytes long, and
+ 4-aligned. So just fish the whole thing out of memory right now
+ and have done. */
+ theInstr = getUIntBigendianly( (UChar*)(&guest_code[delta]) );
+
+ if (0) vex_printf("insn: 0x%x\n", theInstr);
+
+ DIP("\t0x%llx: ", (ULong)guest_CIA_curr_instr);
+
+ /* We may be asked to update the guest CIA before going further. */
+ if (put_IP)
+ putGST( PPC_GST_CIA, mkSzImm(ty, guest_CIA_curr_instr) );
+
+ /* Spot "Special" instructions (see comment at top of file). */
+ {
+ UChar* code = (UChar*)(guest_code + delta);
+ /* Spot the 16-byte preamble:
+ 32-bit mode:
+ 54001800 rlwinm 0,0,3,0,0
+ 54006800 rlwinm 0,0,13,0,0
+ 5400E800 rlwinm 0,0,29,0,0
+ 54009800 rlwinm 0,0,19,0,0
+ 64-bit mode:
+ 78001800 rotldi 0,0,3
+ 78006800 rotldi 0,0,13
+ 7800E802 rotldi 0,0,61
+ 78009802 rotldi 0,0,51
+ */
+ UInt word1 = mode64 ? 0x78001800 : 0x54001800;
+ UInt word2 = mode64 ? 0x78006800 : 0x54006800;
+ UInt word3 = mode64 ? 0x7800E802 : 0x5400E800;
+ UInt word4 = mode64 ? 0x78009802 : 0x54009800;
+ if (getUIntBigendianly(code+ 0) == word1 &&
+ getUIntBigendianly(code+ 4) == word2 &&
+ getUIntBigendianly(code+ 8) == word3 &&
+ getUIntBigendianly(code+12) == word4) {
+ /* Got a "Special" instruction preamble. Which one is it? */
+ if (getUIntBigendianly(code+16) == 0x7C210B78 /* or 1,1,1 */) {
+ /* %R3 = client_request ( %R4 ) */
+ DIP("r3 = client_request ( %%r4 )\n");
+ delta += 20;
+ irsb->next = mkSzImm( ty, guest_CIA_bbstart + delta );
+ irsb->jumpkind = Ijk_ClientReq;
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ else
+ if (getUIntBigendianly(code+16) == 0x7C421378 /* or 2,2,2 */) {
+ /* %R3 = guest_NRADDR */
+ DIP("r3 = guest_NRADDR\n");
+ delta += 20;
+ dres.len = 20;
+ putIReg(3, IRExpr_Get( OFFB_NRADDR, ty ));
+ goto decode_success;
+ }
+ else
+ if (getUIntBigendianly(code+16) == 0x7C631B78 /* or 3,3,3 */) {
+ /* branch-and-link-to-noredir %R11 */
+ DIP("branch-and-link-to-noredir r11\n");
+ delta += 20;
+ putGST( PPC_GST_LR, mkSzImm(ty, guest_CIA_bbstart + (Long)delta) );
+ irsb->next = getIReg(11);
+ irsb->jumpkind = Ijk_NoRedir;
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ else
+ if (getUIntBigendianly(code+16) == 0x7C842378 /* or 4,4,4 */) {
+ /* %R3 = guest_NRADDR_GPR2 */
+ DIP("r3 = guest_NRADDR_GPR2\n");
+ delta += 20;
+ dres.len = 20;
+ putIReg(3, IRExpr_Get( OFFB_NRADDR_GPR2, ty ));
+ goto decode_success;
+ }
+ /* We don't know what it is. Set opc1/opc2 so decode_failure
+ can print the insn following the Special-insn preamble. */
+ theInstr = getUIntBigendianly(code+16);
+ opc1 = ifieldOPC(theInstr);
+ opc2 = ifieldOPClo10(theInstr);
+ goto decode_failure;
+ /*NOTREACHED*/
+ }
+ }
+
+ opc1 = ifieldOPC(theInstr);
+ opc2 = ifieldOPClo10(theInstr);
+
+ // Note: all 'reserved' bits must be cleared, else invalid
+ switch (opc1) {
+
+ /* Integer Arithmetic Instructions */
+ case 0x0C: case 0x0D: case 0x0E: // addic, addic., addi
+ case 0x0F: case 0x07: case 0x08: // addis, mulli, subfic
+ if (dis_int_arith( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Compare Instructions */
+ case 0x0B: case 0x0A: // cmpi, cmpli
+ if (dis_int_cmp( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Logical Instructions */
+ case 0x1C: case 0x1D: case 0x18: // andi., andis., ori
+ case 0x19: case 0x1A: case 0x1B: // oris, xori, xoris
+ if (dis_int_logic( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Rotate Instructions */
+ case 0x14: case 0x15: case 0x17: // rlwimi, rlwinm, rlwnm
+ if (dis_int_rot( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* 64bit Integer Rotate Instructions */
+ case 0x1E: // rldcl, rldcr, rldic, rldicl, rldicr, rldimi
+ if (dis_int_rot( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Load Instructions */
+ case 0x22: case 0x23: case 0x2A: // lbz, lbzu, lha
+ case 0x2B: case 0x28: case 0x29: // lhau, lhz, lhzu
+ case 0x20: case 0x21: // lwz, lwzu
+ if (dis_int_load( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Store Instructions */
+ case 0x26: case 0x27: case 0x2C: // stb, stbu, sth
+ case 0x2D: case 0x24: case 0x25: // sthu, stw, stwu
+ if (dis_int_store( theInstr, abiinfo )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Load and Store Multiple Instructions */
+ case 0x2E: case 0x2F: // lmw, stmw
+ if (dis_int_ldst_mult( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Branch Instructions */
+ case 0x12: case 0x10: // b, bc
+ if (dis_branch(theInstr, abiinfo, &dres,
+ resteerOkFn, callback_opaque))
+ goto decode_success;
+ goto decode_failure;
+
+ /* System Linkage Instructions */
+ case 0x11: // sc
+ if (dis_syslink(theInstr, abiinfo, &dres)) goto decode_success;
+ goto decode_failure;
+
+ /* Trap Instructions */
+ case 0x02: case 0x03: // tdi, twi
+ if (dis_trapi(theInstr, &dres)) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Load Instructions */
+ case 0x30: case 0x31: case 0x32: // lfs, lfsu, lfd
+ case 0x33: // lfdu
+ if (!allow_F) goto decode_noF;
+ if (dis_fp_load( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Store Instructions */
+   case 0x34: case 0x35: case 0x36: // stfs, stfsu, stfd
+   case 0x37:                       // stfdu
+ if (!allow_F) goto decode_noF;
+ if (dis_fp_store( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Load Double Pair Instructions */
+ case 0x39: case 0x3D:
+ if (!allow_F) goto decode_noF;
+ if (dis_fp_pair( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* 64bit Integer Loads */
+ case 0x3A: // ld, ldu, lwa
+ if (!mode64) goto decode_failure;
+ if (dis_int_load( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ case 0x3B:
+ if (!allow_F) goto decode_noF;
+ opc2 = IFIELD(theInstr, 1, 5);
+ switch (opc2) {
+ /* Floating Point Arith Instructions */
+ case 0x12: case 0x14: case 0x15: // fdivs, fsubs, fadds
+ case 0x19: // fmuls
+ if (dis_fp_arith(theInstr)) goto decode_success;
+ goto decode_failure;
+ case 0x16: // fsqrts
+ if (!allow_FX) goto decode_noFX;
+ if (dis_fp_arith(theInstr)) goto decode_success;
+ goto decode_failure;
+ case 0x18: // fres
+ if (!allow_GX) goto decode_noGX;
+ if (dis_fp_arith(theInstr)) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Mult-Add Instructions */
+ case 0x1C: case 0x1D: case 0x1E: // fmsubs, fmadds, fnmsubs
+ case 0x1F: // fnmadds
+ if (dis_fp_multadd(theInstr)) goto decode_success;
+ goto decode_failure;
+
+ case 0x1A: // frsqrtes
+ if (!allow_GX) goto decode_noGX;
+ if (dis_fp_arith(theInstr)) goto decode_success;
+ goto decode_failure;
+
+ default:
+ goto decode_failure;
+ }
+ break;
+
+ /* 64bit Integer Stores */
+ case 0x3E: // std, stdu
+ if (!mode64) goto decode_failure;
+ if (dis_int_store( theInstr, abiinfo )) goto decode_success;
+ goto decode_failure;
+
+ case 0x3F:
+ if (!allow_F) goto decode_noF;
+ /* Instrs using opc[1:5] never overlap instrs using opc[1:10],
+ so we can simply fall through the first switch statement */
+
+ opc2 = IFIELD(theInstr, 1, 5);
+ switch (opc2) {
+ /* Floating Point Arith Instructions */
+ case 0x12: case 0x14: case 0x15: // fdiv, fsub, fadd
+ case 0x19: // fmul
+ if (dis_fp_arith(theInstr)) goto decode_success;
+ goto decode_failure;
+ case 0x16: // fsqrt
+ if (!allow_FX) goto decode_noFX;
+ if (dis_fp_arith(theInstr)) goto decode_success;
+ goto decode_failure;
+ case 0x17: case 0x1A: // fsel, frsqrte
+ if (!allow_GX) goto decode_noGX;
+ if (dis_fp_arith(theInstr)) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Mult-Add Instructions */
+ case 0x1C: case 0x1D: case 0x1E: // fmsub, fmadd, fnmsub
+ case 0x1F: // fnmadd
+ if (dis_fp_multadd(theInstr)) goto decode_success;
+ goto decode_failure;
+
+ case 0x18: // fre
+ if (!allow_GX) goto decode_noGX;
+ if (dis_fp_arith(theInstr)) goto decode_success;
+ goto decode_failure;
+
+ default:
+ break; // Fall through
+ }
+
+ opc2 = IFIELD(theInstr, 1, 10);
+ switch (opc2) {
+ /* Floating Point Compare Instructions */
+ case 0x000: // fcmpu
+ case 0x020: // fcmpo
+ if (dis_fp_cmp(theInstr)) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Rounding/Conversion Instructions */
+ case 0x00C: // frsp
+ case 0x00E: // fctiw
+ case 0x00F: // fctiwz
+ case 0x32E: // fctid
+ case 0x32F: // fctidz
+ case 0x34E: // fcfid
+ if (dis_fp_round(theInstr)) goto decode_success;
+ goto decode_failure;
+
+ /* Power6 rounding stuff */
+ case 0x1E8: // frim
+ case 0x1C8: // frip
+ case 0x188: // frin
+ case 0x1A8: // friz
+         /* A hack to check for Power6 capability... */
+ if ((allow_F && allow_V && allow_FX && allow_GX) &&
+ (dis_fp_round(theInstr)))
+ goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Move Instructions */
+ case 0x008: // fcpsgn
+ case 0x028: // fneg
+ case 0x048: // fmr
+ case 0x088: // fnabs
+ case 0x108: // fabs
+ if (dis_fp_move( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Status/Control Register Instructions */
+ case 0x026: // mtfsb1
+ case 0x040: // mcrfs
+ case 0x046: // mtfsb0
+ case 0x086: // mtfsfi
+ case 0x247: // mffs
+ case 0x2C7: // mtfsf
+ if (dis_fp_scr( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ default:
+ goto decode_failure;
+ }
+ break;
+
+ case 0x13:
+ switch (opc2) {
+
+ /* Condition Register Logical Instructions */
+ case 0x101: case 0x081: case 0x121: // crand, crandc, creqv
+ case 0x0E1: case 0x021: case 0x1C1: // crnand, crnor, cror
+ case 0x1A1: case 0x0C1: case 0x000: // crorc, crxor, mcrf
+ if (dis_cond_logic( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Branch Instructions */
+ case 0x210: case 0x010: // bcctr, bclr
+ if (dis_branch(theInstr, abiinfo, &dres,
+ resteerOkFn, callback_opaque))
+ goto decode_success;
+ goto decode_failure;
+
+ /* Memory Synchronization Instructions */
+ case 0x096: // isync
+ if (dis_memsync( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ default:
+ goto decode_failure;
+ }
+ break;
+
+
+ case 0x1F:
+
+      /* For arith insns, bit10 is the OE flag (overflow enable) */
+
+ opc2 = IFIELD(theInstr, 1, 9);
+ switch (opc2) {
+ /* Integer Arithmetic Instructions */
+ case 0x10A: case 0x00A: case 0x08A: // add, addc, adde
+ case 0x0EA: case 0x0CA: case 0x1EB: // addme, addze, divw
+ case 0x1CB: case 0x04B: case 0x00B: // divwu, mulhw, mulhwu
+ case 0x0EB: case 0x068: case 0x028: // mullw, neg, subf
+ case 0x008: case 0x088: case 0x0E8: // subfc, subfe, subfme
+ case 0x0C8: // subfze
+ if (dis_int_arith( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* 64bit Integer Arithmetic */
+ case 0x009: case 0x049: case 0x0E9: // mulhdu, mulhd, mulld
+ case 0x1C9: case 0x1E9: // divdu, divd
+ if (!mode64) goto decode_failure;
+ if (dis_int_arith( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ case 0x1FC: // cmpb
+ if (dis_int_logic( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ default:
+ break; // Fall through...
+ }
+
+ /* All remaining opcodes use full 10 bits. */
+
+ opc2 = IFIELD(theInstr, 1, 10);
+ switch (opc2) {
+ /* Integer Compare Instructions */
+ case 0x000: case 0x020: // cmp, cmpl
+ if (dis_int_cmp( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Logical Instructions */
+ case 0x01C: case 0x03C: case 0x01A: // and, andc, cntlzw
+ case 0x11C: case 0x3BA: case 0x39A: // eqv, extsb, extsh
+ case 0x1DC: case 0x07C: case 0x1BC: // nand, nor, or
+ case 0x19C: case 0x13C: // orc, xor
+ case 0x2DF: case 0x25F: // mftgpr, mffgpr
+ if (dis_int_logic( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* 64bit Integer Logical Instructions */
+ case 0x3DA: case 0x03A: // extsw, cntlzd
+ if (!mode64) goto decode_failure;
+ if (dis_int_logic( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* 64bit Integer Parity Instructions */
+ case 0xba: case 0x9a: // prtyd, prtyw
+ if (dis_int_parity( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Shift Instructions */
+ case 0x018: case 0x318: case 0x338: // slw, sraw, srawi
+ case 0x218: // srw
+ if (dis_int_shift( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* 64bit Integer Shift Instructions */
+ case 0x01B: case 0x31A: // sld, srad
+ case 0x33A: case 0x33B: // sradi
+ case 0x21B: // srd
+ if (!mode64) goto decode_failure;
+ if (dis_int_shift( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Load Instructions */
+ case 0x057: case 0x077: case 0x157: // lbzx, lbzux, lhax
+ case 0x177: case 0x117: case 0x137: // lhaux, lhzx, lhzux
+ case 0x017: case 0x037: // lwzx, lwzux
+ if (dis_int_load( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* 64bit Integer Load Instructions */
+ case 0x035: case 0x015: // ldux, ldx
+ case 0x175: case 0x155: // lwaux, lwax
+ if (!mode64) goto decode_failure;
+ if (dis_int_load( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Store Instructions */
+ case 0x0F7: case 0x0D7: case 0x1B7: // stbux, stbx, sthux
+ case 0x197: case 0x0B7: case 0x097: // sthx, stwux, stwx
+ if (dis_int_store( theInstr, abiinfo )) goto decode_success;
+ goto decode_failure;
+
+ /* 64bit Integer Store Instructions */
+ case 0x0B5: case 0x095: // stdux, stdx
+ if (!mode64) goto decode_failure;
+ if (dis_int_store( theInstr, abiinfo )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Load and Store with Byte Reverse Instructions */
+ case 0x316: case 0x216: case 0x396: // lhbrx, lwbrx, sthbrx
+ case 0x296: // stwbrx
+ if (dis_int_ldst_rev( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Integer Load and Store String Instructions */
+ case 0x255: case 0x215: case 0x2D5: // lswi, lswx, stswi
+ case 0x295: { // stswx
+ Bool stopHere = False;
+ Bool ok = dis_int_ldst_str( theInstr, &stopHere );
+ if (!ok) goto decode_failure;
+ if (stopHere) {
+ irsb->next = mkSzImm(ty, nextInsnAddr());
+ irsb->jumpkind = Ijk_Boring;
+ dres.whatNext = Dis_StopHere;
+ }
+ goto decode_success;
+ }
+
+ /* Memory Synchronization Instructions */
+ case 0x356: case 0x014: case 0x096: // eieio, lwarx, stwcx.
+ case 0x256: // sync
+ if (dis_memsync( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* 64bit Memory Synchronization Instructions */
+ case 0x054: case 0x0D6: // ldarx, stdcx.
+ if (!mode64) goto decode_failure;
+ if (dis_memsync( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Processor Control Instructions */
+ case 0x200: case 0x013: case 0x153: // mcrxr, mfcr, mfspr
+ case 0x173: case 0x090: case 0x1D3: // mftb, mtcrf, mtspr
+ if (dis_proc_ctl( abiinfo, theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Cache Management Instructions */
+ case 0x2F6: case 0x056: case 0x036: // dcba, dcbf, dcbst
+ case 0x116: case 0x0F6: case 0x3F6: // dcbt, dcbtst, dcbz
+ case 0x3D6: // icbi
+ if (dis_cache_manage( theInstr, &dres, archinfo ))
+ goto decode_success;
+ goto decode_failure;
+
+//zz /* External Control Instructions */
+//zz case 0x136: case 0x1B6: // eciwx, ecowx
+//zz DIP("external control op => not implemented\n");
+//zz goto decode_failure;
+
+ /* Trap Instructions */
+ case 0x004: case 0x044: // tw, td
+ if (dis_trap(theInstr, &dres)) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Load Instructions */
+ case 0x217: case 0x237: case 0x257: // lfsx, lfsux, lfdx
+ case 0x277: // lfdux
+ if (!allow_F) goto decode_noF;
+ if (dis_fp_load( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Store Instructions */
+      case 0x297: case 0x2B7: case 0x2D7: // stfsx, stfsux, stfdx
+      case 0x2F7:                         // stfdux
+ if (!allow_F) goto decode_noF;
+ if (dis_fp_store( theInstr )) goto decode_success;
+ goto decode_failure;
+ case 0x3D7: // stfiwx
+ if (!allow_F) goto decode_noF;
+ if (!allow_GX) goto decode_noGX;
+ if (dis_fp_store( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* Floating Point Double Pair Indexed Instructions */
+ case 0x317: // lfdpx (Power6)
+ case 0x397: // stfdpx (Power6)
+ if (!allow_F) goto decode_noF;
+ if (dis_fp_pair(theInstr)) goto decode_success;
+ goto decode_failure;
+
+ case 0x357: // lfiwax
+ if (!allow_F) goto decode_noF;
+ if (dis_fp_load( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AltiVec instructions */
+
+ /* AV Cache Control - Data streams */
+ case 0x156: case 0x176: case 0x336: // dst, dstst, dss
+ if (!allow_V) goto decode_noV;
+ if (dis_av_datastream( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Load */
+ case 0x006: case 0x026: // lvsl, lvsr
+ case 0x007: case 0x027: case 0x047: // lvebx, lvehx, lvewx
+ case 0x067: case 0x167: // lvx, lvxl
+ if (!allow_V) goto decode_noV;
+ if (dis_av_load( abiinfo, theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Store */
+ case 0x087: case 0x0A7: case 0x0C7: // stvebx, stvehx, stvewx
+ case 0x0E7: case 0x1E7: // stvx, stvxl
+ if (!allow_V) goto decode_noV;
+ if (dis_av_store( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ default:
+ /* Deal with some other cases that we would otherwise have
+ punted on. */
+ /* --- ISEL (PowerISA_V2.05.pdf, p74) --- */
+ /* only decode this insn when reserved bit 0 (31 in IBM's
+ notation) is zero */
+ if (IFIELD(theInstr, 0, 6) == (15<<1)) {
+ UInt rT = ifieldRegDS( theInstr );
+ UInt rA = ifieldRegA( theInstr );
+ UInt rB = ifieldRegB( theInstr );
+ UInt bi = ifieldRegC( theInstr );
+ putIReg(
+ rT,
+ IRExpr_Mux0X( unop(Iop_32to8,getCRbit( bi )),
+ getIReg(rB),
+ rA == 0 ? (mode64 ? mkU64(0) : mkU32(0))
+ : getIReg(rA) )
+ );
+ DIP("isel r%u,r%u,r%u,crb%u\n", rT,rA,rB,bi);
+ goto decode_success;
+ }
+ goto decode_failure;
+ }
+ break;
+
+
+ case 0x04:
+ /* AltiVec instructions */
+
+ opc2 = IFIELD(theInstr, 0, 6);
+ switch (opc2) {
+ /* AV Mult-Add, Mult-Sum */
+ case 0x20: case 0x21: case 0x22: // vmhaddshs, vmhraddshs, vmladduhm
+ case 0x24: case 0x25: case 0x26: // vmsumubm, vmsummbm, vmsumuhm
+ case 0x27: case 0x28: case 0x29: // vmsumuhs, vmsumshm, vmsumshs
+ if (!allow_V) goto decode_noV;
+ if (dis_av_multarith( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Permutations */
+ case 0x2A: // vsel
+ case 0x2B: // vperm
+ case 0x2C: // vsldoi
+ if (!allow_V) goto decode_noV;
+ if (dis_av_permute( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Floating Point Mult-Add/Sub */
+ case 0x2E: case 0x2F: // vmaddfp, vnmsubfp
+ if (!allow_V) goto decode_noV;
+ if (dis_av_fp_arith( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ default:
+ break; // Fall through...
+ }
+
+ opc2 = IFIELD(theInstr, 0, 11);
+ switch (opc2) {
+ /* AV Arithmetic */
+ case 0x180: // vaddcuw
+ case 0x000: case 0x040: case 0x080: // vaddubm, vadduhm, vadduwm
+ case 0x200: case 0x240: case 0x280: // vaddubs, vadduhs, vadduws
+ case 0x300: case 0x340: case 0x380: // vaddsbs, vaddshs, vaddsws
+ case 0x580: // vsubcuw
+ case 0x400: case 0x440: case 0x480: // vsububm, vsubuhm, vsubuwm
+ case 0x600: case 0x640: case 0x680: // vsububs, vsubuhs, vsubuws
+ case 0x700: case 0x740: case 0x780: // vsubsbs, vsubshs, vsubsws
+ case 0x402: case 0x442: case 0x482: // vavgub, vavguh, vavguw
+ case 0x502: case 0x542: case 0x582: // vavgsb, vavgsh, vavgsw
+ case 0x002: case 0x042: case 0x082: // vmaxub, vmaxuh, vmaxuw
+ case 0x102: case 0x142: case 0x182: // vmaxsb, vmaxsh, vmaxsw
+ case 0x202: case 0x242: case 0x282: // vminub, vminuh, vminuw
+ case 0x302: case 0x342: case 0x382: // vminsb, vminsh, vminsw
+ case 0x008: case 0x048: // vmuloub, vmulouh
+ case 0x108: case 0x148: // vmulosb, vmulosh
+ case 0x208: case 0x248: // vmuleub, vmuleuh
+ case 0x308: case 0x348: // vmulesb, vmulesh
+ case 0x608: case 0x708: case 0x648: // vsum4ubs, vsum4sbs, vsum4shs
+ case 0x688: case 0x788: // vsum2sws, vsumsws
+ if (!allow_V) goto decode_noV;
+ if (dis_av_arith( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Rotate, Shift */
+ case 0x004: case 0x044: case 0x084: // vrlb, vrlh, vrlw
+ case 0x104: case 0x144: case 0x184: // vslb, vslh, vslw
+ case 0x204: case 0x244: case 0x284: // vsrb, vsrh, vsrw
+ case 0x304: case 0x344: case 0x384: // vsrab, vsrah, vsraw
+ case 0x1C4: case 0x2C4: // vsl, vsr
+ case 0x40C: case 0x44C: // vslo, vsro
+ if (!allow_V) goto decode_noV;
+ if (dis_av_shift( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Logic */
+ case 0x404: case 0x444: case 0x484: // vand, vandc, vor
+ case 0x4C4: case 0x504: // vxor, vnor
+ if (!allow_V) goto decode_noV;
+ if (dis_av_logic( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Processor Control */
+ case 0x604: case 0x644: // mfvscr, mtvscr
+ if (!allow_V) goto decode_noV;
+ if (dis_av_procctl( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Floating Point Arithmetic */
+ case 0x00A: case 0x04A: // vaddfp, vsubfp
+ case 0x10A: case 0x14A: case 0x18A: // vrefp, vrsqrtefp, vexptefp
+ case 0x1CA: // vlogefp
+ case 0x40A: case 0x44A: // vmaxfp, vminfp
+ if (!allow_V) goto decode_noV;
+ if (dis_av_fp_arith( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Floating Point Round/Convert */
+ case 0x20A: case 0x24A: case 0x28A: // vrfin, vrfiz, vrfip
+ case 0x2CA: // vrfim
+ case 0x30A: case 0x34A: case 0x38A: // vcfux, vcfsx, vctuxs
+ case 0x3CA: // vctsxs
+ if (!allow_V) goto decode_noV;
+ if (dis_av_fp_convert( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Merge, Splat */
+ case 0x00C: case 0x04C: case 0x08C: // vmrghb, vmrghh, vmrghw
+ case 0x10C: case 0x14C: case 0x18C: // vmrglb, vmrglh, vmrglw
+ case 0x20C: case 0x24C: case 0x28C: // vspltb, vsplth, vspltw
+ case 0x30C: case 0x34C: case 0x38C: // vspltisb, vspltish, vspltisw
+ if (!allow_V) goto decode_noV;
+ if (dis_av_permute( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Pack, Unpack */
+ case 0x00E: case 0x04E: case 0x08E: // vpkuhum, vpkuwum, vpkuhus
+ case 0x0CE: // vpkuwus
+ case 0x10E: case 0x14E: case 0x18E: // vpkshus, vpkswus, vpkshss
+ case 0x1CE: // vpkswss
+ case 0x20E: case 0x24E: case 0x28E: // vupkhsb, vupkhsh, vupklsb
+ case 0x2CE: // vupklsh
+ case 0x30E: case 0x34E: case 0x3CE: // vpkpx, vupkhpx, vupklpx
+ if (!allow_V) goto decode_noV;
+ if (dis_av_pack( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ default:
+ break; // Fall through...
+ }
+
+ opc2 = IFIELD(theInstr, 0, 10);
+ switch (opc2) {
+
+ /* AV Compare */
+ case 0x006: case 0x046: case 0x086: // vcmpequb, vcmpequh, vcmpequw
+ case 0x206: case 0x246: case 0x286: // vcmpgtub, vcmpgtuh, vcmpgtuw
+ case 0x306: case 0x346: case 0x386: // vcmpgtsb, vcmpgtsh, vcmpgtsw
+ if (!allow_V) goto decode_noV;
+ if (dis_av_cmp( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ /* AV Floating Point Compare */
+ case 0x0C6: case 0x1C6: case 0x2C6: // vcmpeqfp, vcmpgefp, vcmpgtfp
+ case 0x3C6: // vcmpbfp
+ if (!allow_V) goto decode_noV;
+ if (dis_av_fp_cmp( theInstr )) goto decode_success;
+ goto decode_failure;
+
+ default:
+ goto decode_failure;
+ }
+ break;
+
+ default:
+ goto decode_failure;
+
+ decode_noF:
+ vassert(!allow_F);
+ vex_printf("disInstr(ppc): declined to decode an FP insn.\n");
+ goto decode_failure;
+ decode_noV:
+ vassert(!allow_V);
+ vex_printf("disInstr(ppc): declined to decode an AltiVec insn.\n");
+ goto decode_failure;
+ decode_noFX:
+ vassert(!allow_FX);
+ vex_printf("disInstr(ppc): "
+ "declined to decode a GeneralPurpose-Optional insn.\n");
+ goto decode_failure;
+ decode_noGX:
+ vassert(!allow_GX);
+ vex_printf("disInstr(ppc): "
+ "declined to decode a Graphics-Optional insn.\n");
+ goto decode_failure;
+
+ decode_failure:
+ /* All decode failures end up here. */
+ opc2 = (theInstr) & 0x7FF;
+ vex_printf("disInstr(ppc): unhandled instruction: "
+ "0x%x\n", theInstr);
+ vex_printf(" primary %d(0x%x), secondary %u(0x%x)\n",
+ opc1, opc1, opc2, opc2);
+
+ /* Tell the dispatcher that this insn cannot be decoded, and so has
+ not been executed, and (is currently) the next to be executed.
+      CIA should be up-to-date since it was made so at the start of
+      each insn, but nevertheless be paranoid and update it again
+      right now. */
+ putGST( PPC_GST_CIA, mkSzImm(ty, guest_CIA_curr_instr) );
+ irsb->next = mkSzImm(ty, guest_CIA_curr_instr);
+ irsb->jumpkind = Ijk_NoDecode;
+ dres.whatNext = Dis_StopHere;
+ dres.len = 0;
+ return dres;
+
+ } /* switch (opc) for the main (primary) opcode switch. */
+
+ decode_success:
+ /* All decode successes end up here. */
+ DIP("\n");
+
+ if (dres.len == 0) {
+ dres.len = 4;
+ } else {
+ vassert(dres.len == 20);
+ }
+ return dres;
+}
+
+#undef DIP
+#undef DIS
+
+
+/*------------------------------------------------------------*/
+/*--- Top-level fn ---*/
+/*------------------------------------------------------------*/
+
+/* Disassemble a single instruction into IR. The instruction
+ is located in host memory at &guest_code[delta]. */
+
+DisResult disInstr_PPC ( IRSB* irsb_IN,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_code_IN,
+ Long delta,
+ Addr64 guest_IP,
+ VexArch guest_arch,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo,
+ Bool host_bigendian_IN )
+{
+ IRType ty;
+ DisResult dres;
+ UInt mask32, mask64;
+ UInt hwcaps_guest = archinfo->hwcaps;
+
+ vassert(guest_arch == VexArchPPC32 || guest_arch == VexArchPPC64);
+
+ /* global -- ick */
+ mode64 = guest_arch == VexArchPPC64;
+ ty = mode64 ? Ity_I64 : Ity_I32;
+
+ /* do some sanity checks */
+ mask32 = VEX_HWCAPS_PPC32_F | VEX_HWCAPS_PPC32_V
+ | VEX_HWCAPS_PPC32_FX | VEX_HWCAPS_PPC32_GX;
+
+ mask64 = VEX_HWCAPS_PPC64_V
+ | VEX_HWCAPS_PPC64_FX | VEX_HWCAPS_PPC64_GX;
+
+ if (mode64) {
+ vassert((hwcaps_guest & mask32) == 0);
+ } else {
+ vassert((hwcaps_guest & mask64) == 0);
+ }
+
+ /* Set globals (see top of this file) */
+ guest_code = guest_code_IN;
+ irsb = irsb_IN;
+ host_is_bigendian = host_bigendian_IN;
+
+ guest_CIA_curr_instr = mkSzAddr(ty, guest_IP);
+ guest_CIA_bbstart = mkSzAddr(ty, guest_IP - delta);
+
+ dres = disInstr_PPC_WRK ( put_IP,
+ resteerOkFn, resteerCisOk, callback_opaque,
+ delta, archinfo, abiinfo );
+
+ return dres;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Unused stuff ---*/
+/*------------------------------------------------------------*/
+
+///* A potentially more memcheck-friendly implementation of Clz32, with
+// the boundary case Clz32(0) = 32, which is what ppc requires. */
+//
+//static IRExpr* /* :: Ity_I32 */ verbose_Clz32 ( IRTemp arg )
+//{
+// /* Welcome ... to SSA R Us. */
+// IRTemp n1 = newTemp(Ity_I32);
+// IRTemp n2 = newTemp(Ity_I32);
+// IRTemp n3 = newTemp(Ity_I32);
+// IRTemp n4 = newTemp(Ity_I32);
+// IRTemp n5 = newTemp(Ity_I32);
+// IRTemp n6 = newTemp(Ity_I32);
+// IRTemp n7 = newTemp(Ity_I32);
+// IRTemp n8 = newTemp(Ity_I32);
+// IRTemp n9 = newTemp(Ity_I32);
+// IRTemp n10 = newTemp(Ity_I32);
+// IRTemp n11 = newTemp(Ity_I32);
+// IRTemp n12 = newTemp(Ity_I32);
+//
+// /* First, propagate the most significant 1-bit into all lower
+// positions in the word. */
+// /* unsigned int clz ( unsigned int n )
+// {
+// n |= (n >> 1);
+// n |= (n >> 2);
+// n |= (n >> 4);
+// n |= (n >> 8);
+// n |= (n >> 16);
+// return bitcount(~n);
+// }
+// */
+// assign(n1, mkexpr(arg));
+// assign(n2, binop(Iop_Or32, mkexpr(n1), binop(Iop_Shr32, mkexpr(n1), mkU8(1))));
+// assign(n3, binop(Iop_Or32, mkexpr(n2), binop(Iop_Shr32, mkexpr(n2), mkU8(2))));
+// assign(n4, binop(Iop_Or32, mkexpr(n3), binop(Iop_Shr32, mkexpr(n3), mkU8(4))));
+// assign(n5, binop(Iop_Or32, mkexpr(n4), binop(Iop_Shr32, mkexpr(n4), mkU8(8))));
+// assign(n6, binop(Iop_Or32, mkexpr(n5), binop(Iop_Shr32, mkexpr(n5), mkU8(16))));
+// /* This gives a word of the form 0---01---1. Now invert it, giving
+// a word of the form 1---10---0, then do a population-count idiom
+// (to count the 1s, which is the number of leading zeroes, or 32
+//    if the original word was 0). */
+// assign(n7, unop(Iop_Not32, mkexpr(n6)));
+//
+// /* unsigned int bitcount ( unsigned int n )
+// {
+// n = n - ((n >> 1) & 0x55555555);
+// n = (n & 0x33333333) + ((n >> 2) & 0x33333333);
+// n = (n + (n >> 4)) & 0x0F0F0F0F;
+// n = n + (n >> 8);
+// n = (n + (n >> 16)) & 0x3F;
+// return n;
+// }
+// */
+// assign(n8,
+// binop(Iop_Sub32,
+// mkexpr(n7),
+// binop(Iop_And32,
+// binop(Iop_Shr32, mkexpr(n7), mkU8(1)),
+// mkU32(0x55555555))));
+// assign(n9,
+// binop(Iop_Add32,
+// binop(Iop_And32, mkexpr(n8), mkU32(0x33333333)),
+// binop(Iop_And32,
+// binop(Iop_Shr32, mkexpr(n8), mkU8(2)),
+// mkU32(0x33333333))));
+// assign(n10,
+// binop(Iop_And32,
+// binop(Iop_Add32,
+// mkexpr(n9),
+// binop(Iop_Shr32, mkexpr(n9), mkU8(4))),
+// mkU32(0x0F0F0F0F)));
+// assign(n11,
+// binop(Iop_Add32,
+// mkexpr(n10),
+// binop(Iop_Shr32, mkexpr(n10), mkU8(8))));
+// assign(n12,
+// binop(Iop_Add32,
+// mkexpr(n11),
+// binop(Iop_Shr32, mkexpr(n11), mkU8(16))));
+// return
+// binop(Iop_And32, mkexpr(n12), mkU32(0x3F));
+//}
+
+/*--------------------------------------------------------------------*/
+/*--- end guest_ppc_toIR.c ---*/
+/*--------------------------------------------------------------------*/
diff --git a/VEX/priv/guest_x86_defs.h b/VEX/priv/guest_x86_defs.h
new file mode 100644
index 0000000..09d647a
--- /dev/null
+++ b/VEX/priv/guest_x86_defs.h
@@ -0,0 +1,412 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_x86_defs.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* Only to be used within the guest-x86 directory. */
+
+#ifndef __VEX_GUEST_X86_DEFS_H
+#define __VEX_GUEST_X86_DEFS_H
+
+
+/*---------------------------------------------------------*/
+/*--- x86 to IR conversion ---*/
+/*---------------------------------------------------------*/
+
+/* Convert one x86 insn to IR. See the type DisOneInstrFn in
+ bb_to_IR.h. */
+extern
+DisResult disInstr_X86 ( IRSB* irbb,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_code,
+ Long delta,
+ Addr64 guest_IP,
+ VexArch guest_arch,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo,
+ Bool host_bigendian );
+
+/* Used by the optimiser to specialise calls to helpers. */
+extern
+IRExpr* guest_x86_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts );
+
+/* Describes to the optimiser which parts of the guest state require
+ precise memory exceptions. This is logically part of the guest
+ state description. */
+extern
+Bool guest_x86_state_requires_precise_mem_exns ( Int, Int );
+
+extern
+VexGuestLayout x86guest_layout;
+
+
+/*---------------------------------------------------------*/
+/*--- x86 guest helpers ---*/
+/*---------------------------------------------------------*/
+
+/* --- CLEAN HELPERS --- */
+
+extern UInt x86g_calculate_eflags_all (
+ UInt cc_op, UInt cc_dep1, UInt cc_dep2, UInt cc_ndep
+ );
+
+__attribute((regparm(3)))
+extern UInt x86g_calculate_eflags_c (
+ UInt cc_op, UInt cc_dep1, UInt cc_dep2, UInt cc_ndep
+ );
+
+extern UInt x86g_calculate_condition (
+ UInt/*X86Condcode*/ cond,
+ UInt cc_op,
+ UInt cc_dep1, UInt cc_dep2, UInt cc_ndep
+ );
+
+extern UInt x86g_calculate_FXAM ( UInt tag, ULong dbl );
+
+extern ULong x86g_calculate_RCR (
+ UInt arg, UInt rot_amt, UInt eflags_in, UInt sz
+ );
+extern ULong x86g_calculate_RCL (
+ UInt arg, UInt rot_amt, UInt eflags_in, UInt sz
+ );
+
+extern UInt x86g_calculate_daa_das_aaa_aas ( UInt AX_and_flags, UInt opcode );
+
+extern ULong x86g_check_fldcw ( UInt fpucw );
+
+extern UInt x86g_create_fpucw ( UInt fpround );
+
+extern ULong x86g_check_ldmxcsr ( UInt mxcsr );
+
+extern UInt x86g_create_mxcsr ( UInt sseround );
+
+
+/* Translate a guest virtual_addr into a guest linear address by
+ consulting the supplied LDT/GDT structures. Their representation
+ must be as specified in pub/libvex_guest_x86.h. To indicate a
+ translation failure, 1<<32 is returned. On success, the lower 32
+ bits of the returned result indicate the linear address.
+*/
+extern
+ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
+ UInt seg_selector, UInt virtual_addr );
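+
+/* Illustrative sketch (not part of the original header) of how a
+   caller would consume the result; 'sel' and 'vaddr' are hypothetical
+   names:
+
+      ULong res = x86g_use_seg_selector(ldt, gdt, sel, vaddr);
+      if ((res >> 32) != 0)
+         goto translation_failed;    // bit 32 set: failure
+      UInt linear = (UInt)res;       // low 32 bits: linear address
+*/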
+
+extern ULong x86g_calculate_mmx_pmaddwd ( ULong, ULong );
+extern ULong x86g_calculate_mmx_psadbw ( ULong, ULong );
+extern UInt x86g_calculate_mmx_pmovmskb ( ULong );
+extern UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo );
+
+
+/* --- DIRTY HELPERS --- */
+
+extern ULong x86g_dirtyhelper_loadF80le ( UInt );
+
+extern void x86g_dirtyhelper_storeF80le ( UInt, ULong );
+
+extern void x86g_dirtyhelper_CPUID_sse0 ( VexGuestX86State* );
+extern void x86g_dirtyhelper_CPUID_sse1 ( VexGuestX86State* );
+extern void x86g_dirtyhelper_CPUID_sse2 ( VexGuestX86State* );
+
+extern void x86g_dirtyhelper_FINIT ( VexGuestX86State* );
+
+extern void x86g_dirtyhelper_FXSAVE ( VexGuestX86State*, HWord );
+extern void x86g_dirtyhelper_FSAVE ( VexGuestX86State*, HWord );
+extern void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord );
+
+extern ULong x86g_dirtyhelper_RDTSC ( void );
+
+extern UInt x86g_dirtyhelper_IN ( UInt portno, UInt sz/*1,2 or 4*/ );
+extern void x86g_dirtyhelper_OUT ( UInt portno, UInt data,
+ UInt sz/*1,2 or 4*/ );
+
+extern void x86g_dirtyhelper_SxDT ( void* address,
+ UInt op /* 0 or 1 */ );
+
+extern VexEmWarn
+ x86g_dirtyhelper_FXRSTOR ( VexGuestX86State*, HWord );
+
+extern VexEmWarn
+ x86g_dirtyhelper_FRSTOR ( VexGuestX86State*, HWord );
+
+extern VexEmWarn
+ x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord );
+
+
+/*---------------------------------------------------------*/
+/*--- Condition code stuff ---*/
+/*---------------------------------------------------------*/
+
+/* eflags masks */
+#define X86G_CC_SHIFT_O 11
+#define X86G_CC_SHIFT_S 7
+#define X86G_CC_SHIFT_Z 6
+#define X86G_CC_SHIFT_A 4
+#define X86G_CC_SHIFT_C 0
+#define X86G_CC_SHIFT_P 2
+
+#define X86G_CC_MASK_O (1 << X86G_CC_SHIFT_O)
+#define X86G_CC_MASK_S (1 << X86G_CC_SHIFT_S)
+#define X86G_CC_MASK_Z (1 << X86G_CC_SHIFT_Z)
+#define X86G_CC_MASK_A (1 << X86G_CC_SHIFT_A)
+#define X86G_CC_MASK_C (1 << X86G_CC_SHIFT_C)
+#define X86G_CC_MASK_P (1 << X86G_CC_SHIFT_P)
+
+/* FPU flag masks */
+#define X86G_FC_SHIFT_C3 14
+#define X86G_FC_SHIFT_C2 10
+#define X86G_FC_SHIFT_C1 9
+#define X86G_FC_SHIFT_C0 8
+
+#define X86G_FC_MASK_C3 (1 << X86G_FC_SHIFT_C3)
+#define X86G_FC_MASK_C2 (1 << X86G_FC_SHIFT_C2)
+#define X86G_FC_MASK_C1 (1 << X86G_FC_SHIFT_C1)
+#define X86G_FC_MASK_C0 (1 << X86G_FC_SHIFT_C0)
+
+
+/* %EFLAGS thunk descriptors. A four-word thunk is used to record
+ details of the most recent flag-setting operation, so the flags can
+ be computed later if needed. It is possible to do this a little
+ more efficiently using a 3-word thunk, but that makes it impossible
+ to describe the flag data dependencies sufficiently accurately for
+ Memcheck. Hence 4 words are used, with minimal loss of efficiency.
+
+ The four words are:
+
+ CC_OP, which describes the operation.
+
+ CC_DEP1 and CC_DEP2. These are arguments to the operation.
+ We want Memcheck to believe that the resulting flags are
+ data-dependent on both CC_DEP1 and CC_DEP2, hence the
+ name DEP.
+
+ CC_NDEP. This is a 3rd argument to the operation which is
+ sometimes needed. We arrange things so that Memcheck does
+ not believe the resulting flags are data-dependent on CC_NDEP
+ ("not dependent").
+
+ To make Memcheck believe that (the definedness of) the encoded
+ flags depends only on (the definedness of) CC_DEP1 and CC_DEP2
+ requires two things:
+
+ (1) In the guest state layout info (x86guest_layout), CC_OP and
+ CC_NDEP are marked as always defined.
+
+ (2) When passing the thunk components to an evaluation function
+ (calculate_condition, calculate_eflags, calculate_eflags_c) the
+ IRCallee's mcx_mask must be set so as to exclude from
+ consideration all passed args except CC_DEP1 and CC_DEP2.
+
+ Strictly speaking only (2) is necessary for correctness. However,
+ (1) helps efficiency in that since (2) means we never ask about the
+ definedness of CC_OP or CC_NDEP, we may as well not even bother to
+ track their definedness.
+
+ When building the thunk, it is always necessary to write words into
+ CC_DEP1 and CC_DEP2, even if those args are not used given the
+   CC_OP field (eg, CC_DEP2 is not used if CC_OP is X86G_CC_OP_LOGICB/W/L).
+ This is important because otherwise Memcheck could give false
+ positives as it does not understand the relationship between the
+ CC_OP field and CC_DEP1 and CC_DEP2, and so believes that the
+ definedness of the stored flags always depends on both CC_DEP1 and
+ CC_DEP2.
+
+ However, it is only necessary to set CC_NDEP when the CC_OP value
+ requires it, because Memcheck ignores CC_NDEP, and the evaluation
+ functions do understand the CC_OP fields and will only examine
+ CC_NDEP for suitable values of CC_OP.
+
+ A summary of the field usages is:
+
+ Operation DEP1 DEP2 NDEP
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ add/sub/mul first arg second arg unused
+
+ adc/sbb first arg (second arg)
+ XOR old_carry old_carry
+
+ and/or/xor result zero unused
+
+ inc/dec result zero old_carry
+
+ shl/shr/sar result subshifted- unused
+ result
+
+ rol/ror result zero old_flags
+
+ copy old_flags zero unused.
+
+
+ Therefore Memcheck will believe the following:
+
+ * add/sub/mul -- definedness of result flags depends on definedness
+ of both args.
+
+ * adc/sbb -- definedness of result flags depends on definedness of
+ both args and definedness of the old C flag. Because only two
+ DEP fields are available, the old C flag is XOR'd into the second
+ arg so that Memcheck sees the data dependency on it. That means
+ the NDEP field must contain a second copy of the old C flag
+ so that the evaluation functions can correctly recover the second
+ arg.
+
+ * and/or/xor are straightforward -- definedness of result flags
+ depends on definedness of result value.
+
+ * inc/dec -- definedness of result flags depends only on
+ definedness of result. This isn't really true -- it also depends
+ on the old C flag. However, we don't want Memcheck to see that,
+ and so the old C flag must be passed in NDEP and not in DEP2.
+ It's inconceivable that a compiler would generate code that puts
+ the C flag in an undefined state, then does an inc/dec, which
+ leaves C unchanged, and then makes a conditional jump/move based
+ on C. So our fiction seems a good approximation.
+
+ * shl/shr/sar -- straightforward, again, definedness of result
+ flags depends on definedness of result value. The subshifted
+ value (value shifted one less) is also needed, but its
+ definedness is the same as the definedness of the shifted value.
+
+   * rol/ror -- these only set O and C, and leave A Z S P alone.
+ However it seems prudent (as per inc/dec) to say the definedness
+ of all resulting flags depends on the definedness of the result,
+ hence the old flags must go in as NDEP and not DEP2.
+
+ * rcl/rcr are too difficult to do in-line, and so are done by a
+ helper function. They are not part of this scheme. The helper
+ function takes the value to be rotated, the rotate amount and the
+ old flags, and returns the new flags and the rotated value.
+ Since the helper's mcx_mask does not have any set bits, Memcheck
+ will lazily propagate undefinedness from any of the 3 args into
+ both results (flags and actual value).
+*/
+enum {
+ X86G_CC_OP_COPY=0, /* DEP1 = current flags, DEP2 = 0, NDEP = unused */
+ /* just copy DEP1 to output */
+
+ X86G_CC_OP_ADDB, /* 1 */
+ X86G_CC_OP_ADDW, /* 2 DEP1 = argL, DEP2 = argR, NDEP = unused */
+ X86G_CC_OP_ADDL, /* 3 */
+
+ X86G_CC_OP_SUBB, /* 4 */
+ X86G_CC_OP_SUBW, /* 5 DEP1 = argL, DEP2 = argR, NDEP = unused */
+ X86G_CC_OP_SUBL, /* 6 */
+
+ X86G_CC_OP_ADCB, /* 7 */
+ X86G_CC_OP_ADCW, /* 8 DEP1 = argL, DEP2 = argR ^ oldCarry, NDEP = oldCarry */
+ X86G_CC_OP_ADCL, /* 9 */
+
+ X86G_CC_OP_SBBB, /* 10 */
+ X86G_CC_OP_SBBW, /* 11 DEP1 = argL, DEP2 = argR ^ oldCarry, NDEP = oldCarry */
+ X86G_CC_OP_SBBL, /* 12 */
+
+ X86G_CC_OP_LOGICB, /* 13 */
+ X86G_CC_OP_LOGICW, /* 14 DEP1 = result, DEP2 = 0, NDEP = unused */
+ X86G_CC_OP_LOGICL, /* 15 */
+
+ X86G_CC_OP_INCB, /* 16 */
+ X86G_CC_OP_INCW, /* 17 DEP1 = result, DEP2 = 0, NDEP = oldCarry (0 or 1) */
+ X86G_CC_OP_INCL, /* 18 */
+
+ X86G_CC_OP_DECB, /* 19 */
+ X86G_CC_OP_DECW, /* 20 DEP1 = result, DEP2 = 0, NDEP = oldCarry (0 or 1) */
+ X86G_CC_OP_DECL, /* 21 */
+
+ X86G_CC_OP_SHLB, /* 22 DEP1 = res, DEP2 = res', NDEP = unused */
+ X86G_CC_OP_SHLW, /* 23 where res' is like res but shifted one bit less */
+ X86G_CC_OP_SHLL, /* 24 */
+
+ X86G_CC_OP_SHRB, /* 25 DEP1 = res, DEP2 = res', NDEP = unused */
+ X86G_CC_OP_SHRW, /* 26 where res' is like res but shifted one bit less */
+ X86G_CC_OP_SHRL, /* 27 */
+
+ X86G_CC_OP_ROLB, /* 28 */
+ X86G_CC_OP_ROLW, /* 29 DEP1 = res, DEP2 = 0, NDEP = old flags */
+ X86G_CC_OP_ROLL, /* 30 */
+
+ X86G_CC_OP_RORB, /* 31 */
+ X86G_CC_OP_RORW, /* 32 DEP1 = res, DEP2 = 0, NDEP = old flags */
+ X86G_CC_OP_RORL, /* 33 */
+
+ X86G_CC_OP_UMULB, /* 34 */
+ X86G_CC_OP_UMULW, /* 35 DEP1 = argL, DEP2 = argR, NDEP = unused */
+ X86G_CC_OP_UMULL, /* 36 */
+
+ X86G_CC_OP_SMULB, /* 37 */
+ X86G_CC_OP_SMULW, /* 38 DEP1 = argL, DEP2 = argR, NDEP = unused */
+ X86G_CC_OP_SMULL, /* 39 */
+
+ X86G_CC_OP_NUMBER
+};
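+
+/* Illustrative sketch (an assumption about typical front-end usage,
+   not code from this file): after an "addl %ebx,%eax" the thunk is
+   built as
+
+      CC_OP   = X86G_CC_OP_ADDL
+      CC_DEP1 = argL (the old value of %eax)
+      CC_DEP2 = argR (%ebx)
+      CC_NDEP = unused (need not be set for add)
+
+   and a subsequent "jz" is then evaluated as
+
+      x86g_calculate_condition(X86CondZ, cc_op, cc_dep1, cc_dep2, cc_ndep)
+
+   For adc/sbb, DEP2 holds argR ^ oldCarry, so the evaluation functions
+   recover argR as CC_DEP2 ^ (CC_NDEP & X86G_CC_MASK_C); see
+   ACTIONS_ADC/ACTIONS_SBB in guest_x86_helpers.c. */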
+
+typedef
+ enum {
+ X86CondO = 0, /* overflow */
+ X86CondNO = 1, /* no overflow */
+
+ X86CondB = 2, /* below */
+ X86CondNB = 3, /* not below */
+
+ X86CondZ = 4, /* zero */
+ X86CondNZ = 5, /* not zero */
+
+ X86CondBE = 6, /* below or equal */
+ X86CondNBE = 7, /* not below or equal */
+
+ X86CondS = 8, /* negative */
+ X86CondNS = 9, /* not negative */
+
+ X86CondP = 10, /* parity even */
+ X86CondNP = 11, /* not parity even */
+
+ X86CondL = 12, /* jump less */
+ X86CondNL = 13, /* not less */
+
+ X86CondLE = 14, /* less or equal */
+ X86CondNLE = 15, /* not less or equal */
+
+ X86CondAlways = 16 /* HACK */
+ }
+ X86Condcode;
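+
+/* Note that each condition and its negation differ only in bit 0 of
+   the encoding.  The evaluation helper exploits this; a sketch of the
+   idiom (as used by x86g_calculate_condition):
+
+      UInt inv = cond & 1;      // 1 selects the negated form
+      ...
+      return 1 & (inv ^ zf);    // eg for X86CondZ / X86CondNZ
+*/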
+
+#endif /* ndef __VEX_GUEST_X86_DEFS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end guest_x86_defs.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_x86_helpers.c b/VEX/priv/guest_x86_helpers.c
new file mode 100644
index 0000000..7aa7a33
--- /dev/null
+++ b/VEX/priv/guest_x86_helpers.c
@@ -0,0 +1,2777 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin guest_x86_helpers.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_emwarn.h"
+#include "libvex_guest_x86.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+#include "main_util.h"
+#include "guest_generic_bb_to_IR.h"
+#include "guest_x86_defs.h"
+#include "guest_generic_x87.h"
+
+
+/* This file contains helper functions for x86 guest code.
+ Calls to these functions are generated by the back end.
+ These calls are of course in the host machine code and
+ this file will be compiled to host machine code, so that
+ all makes sense.
+
+ Only change the signatures of these helper functions very
+ carefully. If you change the signature here, you'll have to change
+ the parameters passed to it in the IR calls constructed by
+ guest-x86/toIR.c.
+
+ The convention used is that all functions called from generated
+ code are named x86g_<something>, and any function whose name lacks
+ that prefix is not called from generated code. Note that some
+ LibVEX_* functions can however be called by VEX's client, but that
+ is not the same as calling them from VEX-generated code.
+*/
+
+
+/* Set to 1 to get detailed profiling info about use of the flag
+ machinery. */
+#define PROFILE_EFLAGS 0
+
+
+/*---------------------------------------------------------------*/
+/*--- %eflags run-time helpers. ---*/
+/*---------------------------------------------------------------*/
+
+static const UChar parity_table[256] = {
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0,
+ 0, X86G_CC_MASK_P, X86G_CC_MASK_P, 0, X86G_CC_MASK_P, 0, 0, X86G_CC_MASK_P,
+};
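+
+/* A sketch (an illustration, not code used in the build) of how such
+   a table can be generated: x86 PF is set iff the result's low byte
+   has an even number of 1 bits.
+
+      Int i, j, nSet;
+      for (i = 0; i < 256; i++) {
+         for (nSet = 0, j = 0; j < 8; j++)
+            nSet += (i >> j) & 1;
+         table[i] = (nSet & 1) ? 0 : X86G_CC_MASK_P;
+      }
+*/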
+
+/* generalised left-shifter */
+inline static Int lshift ( Int x, Int n )
+{
+ if (n >= 0)
+ return x << n;
+ else
+ return x >> (-n);
+}
+
+/* identity on ULong */
+static inline ULong idULong ( ULong x )
+{
+ return x;
+}
+
+
+#define PREAMBLE(__data_bits) \
+ /* const */ UInt DATA_MASK \
+ = __data_bits==8 ? 0xFF \
+ : (__data_bits==16 ? 0xFFFF \
+ : 0xFFFFFFFF); \
+ /* const */ UInt SIGN_MASK = 1 << (__data_bits - 1); \
+ /* const */ UInt CC_DEP1 = cc_dep1_formal; \
+ /* const */ UInt CC_DEP2 = cc_dep2_formal; \
+ /* const */ UInt CC_NDEP = cc_ndep_formal; \
+ /* Four bogus assignments, which hopefully gcc can */ \
+ /* optimise away, and which stop it complaining about */ \
+ /* unused variables. */ \
+ SIGN_MASK = SIGN_MASK; \
+ DATA_MASK = DATA_MASK; \
+ CC_DEP2 = CC_DEP2; \
+ CC_NDEP = CC_NDEP;
+
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_ADD(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ Int argL, argR, res; \
+ argL = CC_DEP1; \
+ argR = CC_DEP2; \
+ res = argL + argR; \
+ cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
+ 12 - DATA_BITS) & X86G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SUB(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ Int argL, argR, res; \
+ argL = CC_DEP1; \
+ argR = CC_DEP2; \
+ res = argL - argR; \
+ cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = lshift((argL ^ argR) & (argL ^ res), \
+ 12 - DATA_BITS) & X86G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_ADC(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ Int argL, argR, oldC, res; \
+ oldC = CC_NDEP & X86G_CC_MASK_C; \
+ argL = CC_DEP1; \
+ argR = CC_DEP2 ^ oldC; \
+ res = (argL + argR) + oldC; \
+ if (oldC) \
+ cf = (DATA_UTYPE)res <= (DATA_UTYPE)argL; \
+ else \
+ cf = (DATA_UTYPE)res < (DATA_UTYPE)argL; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = lshift((argL ^ argR ^ -1) & (argL ^ res), \
+ 12 - DATA_BITS) & X86G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SBB(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ Int argL, argR, oldC, res; \
+ oldC = CC_NDEP & X86G_CC_MASK_C; \
+ argL = CC_DEP1; \
+ argR = CC_DEP2 ^ oldC; \
+ res = (argL - argR) - oldC; \
+ if (oldC) \
+ cf = (DATA_UTYPE)argL <= (DATA_UTYPE)argR; \
+ else \
+ cf = (DATA_UTYPE)argL < (DATA_UTYPE)argR; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = lshift((argL ^ argR) & (argL ^ res), \
+ 12 - DATA_BITS) & X86G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_LOGIC(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ cf = 0; \
+ pf = parity_table[(UChar)CC_DEP1]; \
+ af = 0; \
+ zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
+ sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
+ of = 0; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_INC(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ Int argL, argR, res; \
+ res = CC_DEP1; \
+ argL = res - 1; \
+ argR = 1; \
+ cf = CC_NDEP & X86G_CC_MASK_C; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = ((res & DATA_MASK) == SIGN_MASK) << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_DEC(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ Int argL, argR, res; \
+ res = CC_DEP1; \
+ argL = res + 1; \
+ argR = 1; \
+ cf = CC_NDEP & X86G_CC_MASK_C; \
+ pf = parity_table[(UChar)res]; \
+ af = (res ^ argL ^ argR) & 0x10; \
+ zf = ((DATA_UTYPE)res == 0) << 6; \
+ sf = lshift(res, 8 - DATA_BITS) & 0x80; \
+ of = ((res & DATA_MASK) \
+ == ((UInt)SIGN_MASK - 1)) << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SHL(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ cf = (CC_DEP2 >> (DATA_BITS - 1)) & X86G_CC_MASK_C; \
+ pf = parity_table[(UChar)CC_DEP1]; \
+ af = 0; /* undefined */ \
+ zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
+ sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
+ /* of is defined if shift count == 1 */ \
+ of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
+ & X86G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SHR(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ cf = CC_DEP2 & 1; \
+ pf = parity_table[(UChar)CC_DEP1]; \
+ af = 0; /* undefined */ \
+ zf = ((DATA_UTYPE)CC_DEP1 == 0) << 6; \
+ sf = lshift(CC_DEP1, 8 - DATA_BITS) & 0x80; \
+ /* of is defined if shift count == 1 */ \
+ of = lshift(CC_DEP2 ^ CC_DEP1, 12 - DATA_BITS) \
+ & X86G_CC_MASK_O; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+/* ROL: cf' = lsb(result). of' = msb(result) ^ lsb(result). */
+/* DEP1 = result, NDEP = old flags */
+#define ACTIONS_ROL(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int fl \
+ = (CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C)) \
+ | (X86G_CC_MASK_C & CC_DEP1) \
+ | (X86G_CC_MASK_O & (lshift(CC_DEP1, \
+ 11-(DATA_BITS-1)) \
+ ^ lshift(CC_DEP1, 11))); \
+ return fl; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+/* ROR: cf' = msb(result). of' = msb(result) ^ msb-1(result). */
+/* DEP1 = result, NDEP = old flags */
+#define ACTIONS_ROR(DATA_BITS,DATA_UTYPE) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int fl \
+ = (CC_NDEP & ~(X86G_CC_MASK_O | X86G_CC_MASK_C)) \
+ | (X86G_CC_MASK_C & (CC_DEP1 >> (DATA_BITS-1))) \
+ | (X86G_CC_MASK_O & (lshift(CC_DEP1, \
+ 11-(DATA_BITS-1)) \
+ ^ lshift(CC_DEP1, 11-(DATA_BITS-1)+1))); \
+ return fl; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_UMUL(DATA_BITS, DATA_UTYPE, NARROWtoU, \
+ DATA_U2TYPE, NARROWto2U) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ DATA_UTYPE hi; \
+ DATA_UTYPE lo \
+ = NARROWtoU( ((DATA_UTYPE)CC_DEP1) \
+ * ((DATA_UTYPE)CC_DEP2) ); \
+ DATA_U2TYPE rr \
+ = NARROWto2U( \
+ ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP1)) \
+ * ((DATA_U2TYPE)((DATA_UTYPE)CC_DEP2)) ); \
+ hi = NARROWtoU(rr >>/*u*/ DATA_BITS); \
+ cf = (hi != 0); \
+ pf = parity_table[(UChar)lo]; \
+ af = 0; /* undefined */ \
+ zf = (lo == 0) << 6; \
+ sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
+ of = cf << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+/*-------------------------------------------------------------*/
+
+#define ACTIONS_SMUL(DATA_BITS, DATA_STYPE, NARROWtoS, \
+ DATA_S2TYPE, NARROWto2S) \
+{ \
+ PREAMBLE(DATA_BITS); \
+ { Int cf, pf, af, zf, sf, of; \
+ DATA_STYPE hi; \
+ DATA_STYPE lo \
+ = NARROWtoS( ((DATA_STYPE)CC_DEP1) \
+ * ((DATA_STYPE)CC_DEP2) ); \
+ DATA_S2TYPE rr \
+ = NARROWto2S( \
+ ((DATA_S2TYPE)((DATA_STYPE)CC_DEP1)) \
+ * ((DATA_S2TYPE)((DATA_STYPE)CC_DEP2)) ); \
+ hi = NARROWtoS(rr >>/*s*/ DATA_BITS); \
+ cf = (hi != (lo >>/*s*/ (DATA_BITS-1))); \
+ pf = parity_table[(UChar)lo]; \
+ af = 0; /* undefined */ \
+ zf = (lo == 0) << 6; \
+ sf = lshift(lo, 8 - DATA_BITS) & 0x80; \
+ of = cf << 11; \
+ return cf | pf | af | zf | sf | of; \
+ } \
+}
+
+
+#if PROFILE_EFLAGS
+
+static Bool initted = False;
+
+/* C flag, fast route */
+static UInt tabc_fast[X86G_CC_OP_NUMBER];
+/* C flag, slow route */
+static UInt tabc_slow[X86G_CC_OP_NUMBER];
+/* table for calculate_cond */
+static UInt tab_cond[X86G_CC_OP_NUMBER][16];
+/* total entry counts for calc_all, calc_c, calc_cond. */
+static UInt n_calc_all = 0;
+static UInt n_calc_c = 0;
+static UInt n_calc_cond = 0;
+
+#define SHOW_COUNTS_NOW (0 == (0x3FFFFF & (n_calc_all+n_calc_c+n_calc_cond)))
+
+
+static void showCounts ( void )
+{
+ Int op, co;
+ Char ch;
+ vex_printf("\nTotal calls: calc_all=%u calc_cond=%u calc_c=%u\n",
+ n_calc_all, n_calc_cond, n_calc_c);
+
+ vex_printf(" cSLOW cFAST O NO B NB Z NZ BE NBE"
+ " S NS P NP L NL LE NLE\n");
+ vex_printf(" -----------------------------------------------------"
+ "----------------------------------------\n");
+ for (op = 0; op < X86G_CC_OP_NUMBER; op++) {
+
+ ch = ' ';
+ if (op > 0 && (op-1) % 3 == 0)
+ ch = 'B';
+ if (op > 0 && (op-1) % 3 == 1)
+ ch = 'W';
+ if (op > 0 && (op-1) % 3 == 2)
+ ch = 'L';
+
+ vex_printf("%2d%c: ", op, ch);
+ vex_printf("%6u ", tabc_slow[op]);
+ vex_printf("%6u ", tabc_fast[op]);
+ for (co = 0; co < 16; co++) {
+ Int n = tab_cond[op][co];
+ if (n >= 1000) {
+ vex_printf(" %3dK", n / 1000);
+ } else
+ if (n >= 0) {
+ vex_printf(" %3d ", n );
+ } else {
+ vex_printf(" ");
+ }
+ }
+ vex_printf("\n");
+ }
+ vex_printf("\n");
+}
+
+static void initCounts ( void )
+{
+ Int op, co;
+ initted = True;
+ for (op = 0; op < X86G_CC_OP_NUMBER; op++) {
+ tabc_fast[op] = tabc_slow[op] = 0;
+ for (co = 0; co < 16; co++)
+ tab_cond[op][co] = 0;
+ }
+}
+
+#endif /* PROFILE_EFLAGS */
+
+
+/* Calculate all 6 flags from the supplied thunk parameters.
+   Worker function, not called directly from generated code. */
+static
+UInt x86g_calculate_eflags_all_WRK ( UInt cc_op,
+ UInt cc_dep1_formal,
+ UInt cc_dep2_formal,
+ UInt cc_ndep_formal )
+{
+ switch (cc_op) {
+ case X86G_CC_OP_COPY:
+ return cc_dep1_formal
+ & (X86G_CC_MASK_O | X86G_CC_MASK_S | X86G_CC_MASK_Z
+ | X86G_CC_MASK_A | X86G_CC_MASK_C | X86G_CC_MASK_P);
+
+ case X86G_CC_OP_ADDB: ACTIONS_ADD( 8, UChar );
+ case X86G_CC_OP_ADDW: ACTIONS_ADD( 16, UShort );
+ case X86G_CC_OP_ADDL: ACTIONS_ADD( 32, UInt );
+
+ case X86G_CC_OP_ADCB: ACTIONS_ADC( 8, UChar );
+ case X86G_CC_OP_ADCW: ACTIONS_ADC( 16, UShort );
+ case X86G_CC_OP_ADCL: ACTIONS_ADC( 32, UInt );
+
+ case X86G_CC_OP_SUBB: ACTIONS_SUB( 8, UChar );
+ case X86G_CC_OP_SUBW: ACTIONS_SUB( 16, UShort );
+ case X86G_CC_OP_SUBL: ACTIONS_SUB( 32, UInt );
+
+ case X86G_CC_OP_SBBB: ACTIONS_SBB( 8, UChar );
+ case X86G_CC_OP_SBBW: ACTIONS_SBB( 16, UShort );
+ case X86G_CC_OP_SBBL: ACTIONS_SBB( 32, UInt );
+
+ case X86G_CC_OP_LOGICB: ACTIONS_LOGIC( 8, UChar );
+ case X86G_CC_OP_LOGICW: ACTIONS_LOGIC( 16, UShort );
+ case X86G_CC_OP_LOGICL: ACTIONS_LOGIC( 32, UInt );
+
+ case X86G_CC_OP_INCB: ACTIONS_INC( 8, UChar );
+ case X86G_CC_OP_INCW: ACTIONS_INC( 16, UShort );
+ case X86G_CC_OP_INCL: ACTIONS_INC( 32, UInt );
+
+ case X86G_CC_OP_DECB: ACTIONS_DEC( 8, UChar );
+ case X86G_CC_OP_DECW: ACTIONS_DEC( 16, UShort );
+ case X86G_CC_OP_DECL: ACTIONS_DEC( 32, UInt );
+
+ case X86G_CC_OP_SHLB: ACTIONS_SHL( 8, UChar );
+ case X86G_CC_OP_SHLW: ACTIONS_SHL( 16, UShort );
+ case X86G_CC_OP_SHLL: ACTIONS_SHL( 32, UInt );
+
+ case X86G_CC_OP_SHRB: ACTIONS_SHR( 8, UChar );
+ case X86G_CC_OP_SHRW: ACTIONS_SHR( 16, UShort );
+ case X86G_CC_OP_SHRL: ACTIONS_SHR( 32, UInt );
+
+ case X86G_CC_OP_ROLB: ACTIONS_ROL( 8, UChar );
+ case X86G_CC_OP_ROLW: ACTIONS_ROL( 16, UShort );
+ case X86G_CC_OP_ROLL: ACTIONS_ROL( 32, UInt );
+
+ case X86G_CC_OP_RORB: ACTIONS_ROR( 8, UChar );
+ case X86G_CC_OP_RORW: ACTIONS_ROR( 16, UShort );
+ case X86G_CC_OP_RORL: ACTIONS_ROR( 32, UInt );
+
+ case X86G_CC_OP_UMULB: ACTIONS_UMUL( 8, UChar, toUChar,
+ UShort, toUShort );
+ case X86G_CC_OP_UMULW: ACTIONS_UMUL( 16, UShort, toUShort,
+ UInt, toUInt );
+ case X86G_CC_OP_UMULL: ACTIONS_UMUL( 32, UInt, toUInt,
+ ULong, idULong );
+
+ case X86G_CC_OP_SMULB: ACTIONS_SMUL( 8, Char, toUChar,
+ Short, toUShort );
+ case X86G_CC_OP_SMULW: ACTIONS_SMUL( 16, Short, toUShort,
+ Int, toUInt );
+ case X86G_CC_OP_SMULL: ACTIONS_SMUL( 32, Int, toUInt,
+ Long, idULong );
+
+ default:
+ /* shouldn't really make these calls from generated code */
+ vex_printf("x86g_calculate_eflags_all_WRK(X86)"
+ "( %u, 0x%x, 0x%x, 0x%x )\n",
+ cc_op, cc_dep1_formal, cc_dep2_formal, cc_ndep_formal );
+ vpanic("x86g_calculate_eflags_all_WRK(X86)");
+ }
+}
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate all 6 flags from the supplied thunk parameters. */
+UInt x86g_calculate_eflags_all ( UInt cc_op,
+ UInt cc_dep1,
+ UInt cc_dep2,
+ UInt cc_ndep )
+{
+# if PROFILE_EFLAGS
+ if (!initted) initCounts();
+ n_calc_all++;
+ if (SHOW_COUNTS_NOW) showCounts();
+# endif
+ return
+ x86g_calculate_eflags_all_WRK ( cc_op, cc_dep1, cc_dep2, cc_ndep );
+}
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate just the carry flag from the supplied thunk parameters. */
+__attribute((regparm(3)))
+UInt x86g_calculate_eflags_c ( UInt cc_op,
+ UInt cc_dep1,
+ UInt cc_dep2,
+ UInt cc_ndep )
+{
+# if PROFILE_EFLAGS
+ if (!initted) initCounts();
+ n_calc_c++;
+ tabc_fast[cc_op]++;
+ if (SHOW_COUNTS_NOW) showCounts();
+# endif
+
+ /* Fast-case some common ones. */
+ switch (cc_op) {
+ case X86G_CC_OP_LOGICL:
+ case X86G_CC_OP_LOGICW:
+ case X86G_CC_OP_LOGICB:
+ return 0;
+ case X86G_CC_OP_SUBL:
+ return ((UInt)cc_dep1) < ((UInt)cc_dep2)
+ ? X86G_CC_MASK_C : 0;
+ case X86G_CC_OP_SUBW:
+ return ((UInt)(cc_dep1 & 0xFFFF)) < ((UInt)(cc_dep2 & 0xFFFF))
+ ? X86G_CC_MASK_C : 0;
+ case X86G_CC_OP_SUBB:
+ return ((UInt)(cc_dep1 & 0xFF)) < ((UInt)(cc_dep2 & 0xFF))
+ ? X86G_CC_MASK_C : 0;
+ case X86G_CC_OP_INCL:
+ case X86G_CC_OP_DECL:
+ return cc_ndep & X86G_CC_MASK_C;
+ default:
+ break;
+ }
+
+# if PROFILE_EFLAGS
+ tabc_fast[cc_op]--;
+ tabc_slow[cc_op]++;
+# endif
+
+ return x86g_calculate_eflags_all_WRK(cc_op,cc_dep1,cc_dep2,cc_ndep)
+ & X86G_CC_MASK_C;
+}
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* returns 1 or 0 */
+UInt x86g_calculate_condition ( UInt/*X86Condcode*/ cond,
+ UInt cc_op,
+ UInt cc_dep1,
+ UInt cc_dep2,
+ UInt cc_ndep )
+{
+ UInt eflags = x86g_calculate_eflags_all_WRK(cc_op, cc_dep1,
+ cc_dep2, cc_ndep);
+ UInt of,sf,zf,cf,pf;
+ UInt inv = cond & 1;
+
+# if PROFILE_EFLAGS
+ if (!initted) initCounts();
+ tab_cond[cc_op][cond]++;
+ n_calc_cond++;
+ if (SHOW_COUNTS_NOW) showCounts();
+# endif
+
+ switch (cond) {
+ case X86CondNO:
+ case X86CondO: /* OF == 1 */
+ of = eflags >> X86G_CC_SHIFT_O;
+ return 1 & (inv ^ of);
+
+ case X86CondNZ:
+ case X86CondZ: /* ZF == 1 */
+ zf = eflags >> X86G_CC_SHIFT_Z;
+ return 1 & (inv ^ zf);
+
+ case X86CondNB:
+ case X86CondB: /* CF == 1 */
+ cf = eflags >> X86G_CC_SHIFT_C;
+ return 1 & (inv ^ cf);
+
+ case X86CondNBE:
+ case X86CondBE: /* (CF or ZF) == 1 */
+ cf = eflags >> X86G_CC_SHIFT_C;
+ zf = eflags >> X86G_CC_SHIFT_Z;
+ return 1 & (inv ^ (cf | zf));
+
+ case X86CondNS:
+ case X86CondS: /* SF == 1 */
+ sf = eflags >> X86G_CC_SHIFT_S;
+ return 1 & (inv ^ sf);
+
+ case X86CondNP:
+ case X86CondP: /* PF == 1 */
+ pf = eflags >> X86G_CC_SHIFT_P;
+ return 1 & (inv ^ pf);
+
+ case X86CondNL:
+ case X86CondL: /* (SF xor OF) == 1 */
+ sf = eflags >> X86G_CC_SHIFT_S;
+ of = eflags >> X86G_CC_SHIFT_O;
+ return 1 & (inv ^ (sf ^ of));
+
+ case X86CondNLE:
+ case X86CondLE: /* ((SF xor OF) or ZF) == 1 */
+ sf = eflags >> X86G_CC_SHIFT_S;
+ of = eflags >> X86G_CC_SHIFT_O;
+ zf = eflags >> X86G_CC_SHIFT_Z;
+ return 1 & (inv ^ ((sf ^ of) | zf));
+
+ default:
+ /* shouldn't really make these calls from generated code */
+ vex_printf("x86g_calculate_condition( %u, %u, 0x%x, 0x%x, 0x%x )\n",
+ cond, cc_op, cc_dep1, cc_dep2, cc_ndep );
+ vpanic("x86g_calculate_condition");
+ }
+}
+
+
+/* VISIBLE TO LIBVEX CLIENT */
+UInt LibVEX_GuestX86_get_eflags ( /*IN*/VexGuestX86State* vex_state )
+{
+ UInt eflags = x86g_calculate_eflags_all_WRK(
+ vex_state->guest_CC_OP,
+ vex_state->guest_CC_DEP1,
+ vex_state->guest_CC_DEP2,
+ vex_state->guest_CC_NDEP
+ );
+ UInt dflag = vex_state->guest_DFLAG;
+ vassert(dflag == 1 || dflag == 0xFFFFFFFF);
+ if (dflag == 0xFFFFFFFF)
+ eflags |= (1<<10);
+ if (vex_state->guest_IDFLAG == 1)
+ eflags |= (1<<21);
+ if (vex_state->guest_ACFLAG == 1)
+ eflags |= (1<<18);
+
+ return eflags;
+}
+
+/* VISIBLE TO LIBVEX CLIENT */
+void
+LibVEX_GuestX86_put_eflag_c ( UInt new_carry_flag,
+ /*MOD*/VexGuestX86State* vex_state )
+{
+ UInt oszacp = x86g_calculate_eflags_all_WRK(
+ vex_state->guest_CC_OP,
+ vex_state->guest_CC_DEP1,
+ vex_state->guest_CC_DEP2,
+ vex_state->guest_CC_NDEP
+ );
+ if (new_carry_flag & 1) {
+ oszacp |= X86G_CC_MASK_C;
+ } else {
+ oszacp &= ~X86G_CC_MASK_C;
+ }
+ vex_state->guest_CC_OP = X86G_CC_OP_COPY;
+ vex_state->guest_CC_DEP1 = oszacp;
+ vex_state->guest_CC_DEP2 = 0;
+ vex_state->guest_CC_NDEP = 0;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- %eflags translation-time function specialisers. ---*/
+/*--- These help iropt specialise calls the above run-time ---*/
+/*--- %eflags functions. ---*/
+/*---------------------------------------------------------------*/
+
+/* Used by the optimiser to try specialisations. Returns an
+ equivalent expression, or NULL if none. */
+
+static inline Bool isU32 ( IRExpr* e, UInt n )
+{
+ return
+ toBool( e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U32
+ && e->Iex.Const.con->Ico.U32 == n );
+}
+
+IRExpr* guest_x86_spechelper ( HChar* function_name,
+ IRExpr** args,
+ IRStmt** precedingStmts,
+ Int n_precedingStmts )
+{
+# define unop(_op,_a1) IRExpr_Unop((_op),(_a1))
+# define binop(_op,_a1,_a2) IRExpr_Binop((_op),(_a1),(_a2))
+# define mkU32(_n) IRExpr_Const(IRConst_U32(_n))
+# define mkU8(_n) IRExpr_Const(IRConst_U8(_n))
+
+ Int i, arity = 0;
+ for (i = 0; args[i]; i++)
+ arity++;
+# if 0
+ vex_printf("spec request:\n");
+ vex_printf(" %s ", function_name);
+ for (i = 0; i < arity; i++) {
+ vex_printf(" ");
+ ppIRExpr(args[i]);
+ }
+ vex_printf("\n");
+# endif
+
+ /* --------- specialising "x86g_calculate_condition" --------- */
+
+ if (vex_streq(function_name, "x86g_calculate_condition")) {
+ /* specialise calls to above "calculate condition" function */
+ IRExpr *cond, *cc_op, *cc_dep1, *cc_dep2;
+ vassert(arity == 5);
+ cond = args[0];
+ cc_op = args[1];
+ cc_dep1 = args[2];
+ cc_dep2 = args[3];
+
+ /*---------------- ADDL ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_ADDL) && isU32(cond, X86CondZ)) {
+ /* long add, then Z --> test (dst+src == 0) */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32,
+ binop(Iop_Add32, cc_dep1, cc_dep2),
+ mkU32(0)));
+ }
+
+ /*---------------- SUBL ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondZ)) {
+ /* long sub/cmp, then Z --> test dst==src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32, cc_dep1, cc_dep2));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNZ)) {
+ /* long sub/cmp, then NZ --> test dst!=src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpNE32, cc_dep1, cc_dep2));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondL)) {
+ /* long sub/cmp, then L (signed less than)
+ --> test dst <s src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLT32S, cc_dep1, cc_dep2));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNL)) {
+ /* long sub/cmp, then NL (signed greater than or equal)
+ --> test !(dst <s src) */
+ return binop(Iop_Xor32,
+ unop(Iop_1Uto32,
+ binop(Iop_CmpLT32S, cc_dep1, cc_dep2)),
+ mkU32(1));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondLE)) {
+ /* long sub/cmp, then LE (signed less than or equal)
+ --> test dst <=s src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLE32S, cc_dep1, cc_dep2));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNLE)) {
+ /* long sub/cmp, then NLE (signed not less than or equal)
+ --> test dst >s src
+ --> test !(dst <=s src) */
+ return binop(Iop_Xor32,
+ unop(Iop_1Uto32,
+ binop(Iop_CmpLE32S, cc_dep1, cc_dep2)),
+ mkU32(1));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondBE)) {
+ /* long sub/cmp, then BE (unsigned less than or equal)
+ --> test dst <=u src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLE32U, cc_dep1, cc_dep2));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNBE)) {
+         /* long sub/cmp, then NBE (unsigned greater than)
+ --> test !(dst <=u src) */
+ return binop(Iop_Xor32,
+ unop(Iop_1Uto32,
+ binop(Iop_CmpLE32U, cc_dep1, cc_dep2)),
+ mkU32(1));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondB)) {
+ /* long sub/cmp, then B (unsigned less than)
+ --> test dst <u src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNB)) {
+ /* long sub/cmp, then NB (unsigned greater than or equal)
+ --> test !(dst <u src) */
+ return binop(Iop_Xor32,
+ unop(Iop_1Uto32,
+ binop(Iop_CmpLT32U, cc_dep1, cc_dep2)),
+ mkU32(1));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondS)) {
+ /* long sub/cmp, then S (negative) --> test (dst-src <s 0) */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLT32S,
+ binop(Iop_Sub32, cc_dep1, cc_dep2),
+ mkU32(0)));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBL) && isU32(cond, X86CondNS)) {
+ /* long sub/cmp, then NS (not negative) --> test !(dst-src <s 0) */
+ return binop(Iop_Xor32,
+ unop(Iop_1Uto32,
+ binop(Iop_CmpLT32S,
+ binop(Iop_Sub32, cc_dep1, cc_dep2),
+ mkU32(0))),
+ mkU32(1));
+ }
+
+ /*---------------- SUBW ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_SUBW) && isU32(cond, X86CondZ)) {
+ /* word sub/cmp, then Z --> test dst==src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ16,
+ unop(Iop_32to16,cc_dep1),
+ unop(Iop_32to16,cc_dep2)));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBW) && isU32(cond, X86CondNZ)) {
+ /* word sub/cmp, then NZ --> test dst!=src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpNE16,
+ unop(Iop_32to16,cc_dep1),
+ unop(Iop_32to16,cc_dep2)));
+ }
+
+ /*---------------- SUBB ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondZ)) {
+ /* byte sub/cmp, then Z --> test dst==src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ8,
+ unop(Iop_32to8,cc_dep1),
+ unop(Iop_32to8,cc_dep2)));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNZ)) {
+ /* byte sub/cmp, then NZ --> test dst!=src */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpNE8,
+ unop(Iop_32to8,cc_dep1),
+ unop(Iop_32to8,cc_dep2)));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNBE)) {
+ /* byte sub/cmp, then NBE (unsigned greater than)
+ --> test src <u dst */
+ /* Note, args are opposite way round from the usual */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLT32U,
+ binop(Iop_And32,cc_dep2,mkU32(0xFF)),
+ binop(Iop_And32,cc_dep1,mkU32(0xFF))));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondS)
+ && isU32(cc_dep2, 0)) {
+ /* byte sub/cmp of zero, then S --> test (dst-0 <s 0)
+ --> test dst <s 0
+ --> (UInt)dst[7]
+ This is yet another scheme by which gcc figures out if the
+ top bit of a byte is 1 or 0. See also LOGICB/CondS below. */
+ /* Note: isU32(cc_dep2, 0) is correct, even though this is
+ for an 8-bit comparison, since the args to the helper
+ function are always U32s. */
+ return binop(Iop_And32,
+ binop(Iop_Shr32,cc_dep1,mkU8(7)),
+ mkU32(1));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBB) && isU32(cond, X86CondNS)
+ && isU32(cc_dep2, 0)) {
+ /* byte sub/cmp of zero, then NS --> test !(dst-0 <s 0)
+ --> test !(dst <s 0)
+ --> (UInt) !dst[7]
+ */
+ return binop(Iop_Xor32,
+ binop(Iop_And32,
+ binop(Iop_Shr32,cc_dep1,mkU8(7)),
+ mkU32(1)),
+ mkU32(1));
+ }
+
+ /*---------------- LOGICL ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondZ)) {
+ /* long and/or/xor, then Z --> test dst==0 */
+ return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
+ }
+ if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondNZ)) {
+ /* long and/or/xor, then NZ --> test dst!=0 */
+ return unop(Iop_1Uto32,binop(Iop_CmpNE32, cc_dep1, mkU32(0)));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondLE)) {
+ /* long and/or/xor, then LE
+ This is pretty subtle. LOGIC sets SF and ZF according to the
+ result and makes OF be zero. LE computes (SZ ^ OF) | ZF, but
+            result and makes OF be zero. LE computes (SF ^ OF) | ZF, but
+            OF is zero, so this reduces to SF | ZF -- which will be 1 iff
+ */
+ return unop(Iop_1Uto32,binop(Iop_CmpLE32S, cc_dep1, mkU32(0)));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondBE)) {
+ /* long and/or/xor, then BE
+ LOGIC sets ZF according to the result and makes CF be zero.
+            BE computes (CF | ZF), but CF is zero, so this reduces to ZF
+ -- which will be 1 iff the result is zero. Hence ...
+ */
+ return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondS)) {
+ /* see comment below for (LOGICB, CondS) */
+ /* long and/or/xor, then S --> (UInt)result[31] */
+ return binop(Iop_And32,
+ binop(Iop_Shr32,cc_dep1,mkU8(31)),
+ mkU32(1));
+ }
+ if (isU32(cc_op, X86G_CC_OP_LOGICL) && isU32(cond, X86CondNS)) {
+ /* see comment below for (LOGICB, CondNS) */
+         /* long and/or/xor, then NS --> (UInt) ~ result[31] */
+ return binop(Iop_Xor32,
+ binop(Iop_And32,
+ binop(Iop_Shr32,cc_dep1,mkU8(31)),
+ mkU32(1)),
+ mkU32(1));
+ }
+
+ /*---------------- LOGICW ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_LOGICW) && isU32(cond, X86CondZ)) {
+ /* word and/or/xor, then Z --> test dst==0 */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(0xFFFF)),
+ mkU32(0)));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_LOGICW) && isU32(cond, X86CondS)) {
+ /* see comment below for (LOGICB, CondS) */
+ /* word and/or/xor, then S --> (UInt)result[15] */
+ return binop(Iop_And32,
+ binop(Iop_Shr32,cc_dep1,mkU8(15)),
+ mkU32(1));
+ }
+
+ /*---------------- LOGICB ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondZ)) {
+ /* byte and/or/xor, then Z --> test dst==0 */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32, binop(Iop_And32,cc_dep1,mkU32(255)),
+ mkU32(0)));
+ }
+ if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondNZ)) {
+         /* byte and/or/xor, then NZ --> test dst!=0 */
+ /* b9ac9: 84 c0 test %al,%al
+ b9acb: 75 0d jne b9ada */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpNE32, binop(Iop_And32,cc_dep1,mkU32(255)),
+ mkU32(0)));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondS)) {
+ /* this is an idiom gcc sometimes uses to find out if the top
+ bit of a byte register is set: eg testb %al,%al; js ..
+ Since it just depends on the top bit of the byte, extract
+ that bit and explicitly get rid of all the rest. This
+ helps memcheck avoid false positives in the case where any
+ of the other bits in the byte are undefined. */
+ /* byte and/or/xor, then S --> (UInt)result[7] */
+ return binop(Iop_And32,
+ binop(Iop_Shr32,cc_dep1,mkU8(7)),
+ mkU32(1));
+ }
+ if (isU32(cc_op, X86G_CC_OP_LOGICB) && isU32(cond, X86CondNS)) {
+ /* ditto, for negation-of-S. */
+         /* byte and/or/xor, then NS --> (UInt) ~ result[7] */
+ return binop(Iop_Xor32,
+ binop(Iop_And32,
+ binop(Iop_Shr32,cc_dep1,mkU8(7)),
+ mkU32(1)),
+ mkU32(1));
+ }
+
+ /*---------------- DECL ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondZ)) {
+ /* dec L, then Z --> test dst == 0 */
+ return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_DECL) && isU32(cond, X86CondS)) {
+ /* dec L, then S --> compare DST <s 0 */
+ return unop(Iop_1Uto32,binop(Iop_CmpLT32S, cc_dep1, mkU32(0)));
+ }
+
+ /*---------------- DECW ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_DECW) && isU32(cond, X86CondZ)) {
+ /* dec W, then Z --> test dst == 0 */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32,
+ binop(Iop_Shl32,cc_dep1,mkU8(16)),
+ mkU32(0)));
+ }
+
+ /*---------------- INCW ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_INCW) && isU32(cond, X86CondZ)) {
+ /* This rewrite helps memcheck on 'incw %ax ; je ...'. */
+ /* inc W, then Z --> test dst == 0 */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32,
+ binop(Iop_Shl32,cc_dep1,mkU8(16)),
+ mkU32(0)));
+ }
+
+ /*---------------- SHRL ----------------*/
+
+ if (isU32(cc_op, X86G_CC_OP_SHRL) && isU32(cond, X86CondZ)) {
+ /* SHRL, then Z --> test dep1 == 0 */
+ return unop(Iop_1Uto32,binop(Iop_CmpEQ32, cc_dep1, mkU32(0)));
+ }
+
+ /*---------------- COPY ----------------*/
+ /* This can happen, as a result of x87 FP compares: "fcom ... ;
+ fnstsw %ax ; sahf ; jbe" for example. */
+
+ if (isU32(cc_op, X86G_CC_OP_COPY) &&
+ (isU32(cond, X86CondBE) || isU32(cond, X86CondNBE))) {
+ /* COPY, then BE --> extract C and Z from dep1, and test
+ (C or Z) == 1. */
+ /* COPY, then NBE --> extract C and Z from dep1, and test
+ (C or Z) == 0. */
+ UInt nnn = isU32(cond, X86CondBE) ? 1 : 0;
+ return
+ unop(
+ Iop_1Uto32,
+ binop(
+ Iop_CmpEQ32,
+ binop(
+ Iop_And32,
+ binop(
+ Iop_Or32,
+ binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
+ binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_Z))
+ ),
+ mkU32(1)
+ ),
+ mkU32(nnn)
+ )
+ );
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_COPY)
+ && (isU32(cond, X86CondB) || isU32(cond, X86CondNB))) {
+ /* COPY, then B --> extract C from dep1, and test (C == 1). */
+ /* COPY, then NB --> extract C from dep1, and test (C == 0). */
+ UInt nnn = isU32(cond, X86CondB) ? 1 : 0;
+ return
+ unop(
+ Iop_1Uto32,
+ binop(
+ Iop_CmpEQ32,
+ binop(
+ Iop_And32,
+ binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
+ mkU32(1)
+ ),
+ mkU32(nnn)
+ )
+ );
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_COPY)
+ && (isU32(cond, X86CondZ) || isU32(cond, X86CondNZ))) {
+ /* COPY, then Z --> extract Z from dep1, and test (Z == 1). */
+ /* COPY, then NZ --> extract Z from dep1, and test (Z == 0). */
+ UInt nnn = isU32(cond, X86CondZ) ? 1 : 0;
+ return
+ unop(
+ Iop_1Uto32,
+ binop(
+ Iop_CmpEQ32,
+ binop(
+ Iop_And32,
+ binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_Z)),
+ mkU32(1)
+ ),
+ mkU32(nnn)
+ )
+ );
+ }
+
+ if (isU32(cc_op, X86G_CC_OP_COPY)
+ && (isU32(cond, X86CondP) || isU32(cond, X86CondNP))) {
+ /* COPY, then P --> extract P from dep1, and test (P == 1). */
+ /* COPY, then NP --> extract P from dep1, and test (P == 0). */
+ UInt nnn = isU32(cond, X86CondP) ? 1 : 0;
+ return
+ unop(
+ Iop_1Uto32,
+ binop(
+ Iop_CmpEQ32,
+ binop(
+ Iop_And32,
+ binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_P)),
+ mkU32(1)
+ ),
+ mkU32(nnn)
+ )
+ );
+ }
+
+ return NULL;
+ }
+
+ /* --------- specialising "x86g_calculate_eflags_c" --------- */
+
+ if (vex_streq(function_name, "x86g_calculate_eflags_c")) {
+ /* specialise calls to above "calculate_eflags_c" function */
+ IRExpr *cc_op, *cc_dep1, *cc_dep2, *cc_ndep;
+ vassert(arity == 4);
+ cc_op = args[0];
+ cc_dep1 = args[1];
+ cc_dep2 = args[2];
+ cc_ndep = args[3];
+
+ if (isU32(cc_op, X86G_CC_OP_SUBL)) {
+ /* C after sub denotes unsigned less than */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLT32U, cc_dep1, cc_dep2));
+ }
+ if (isU32(cc_op, X86G_CC_OP_SUBB)) {
+ /* C after sub denotes unsigned less than */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLT32U,
+ binop(Iop_And32,cc_dep1,mkU32(0xFF)),
+ binop(Iop_And32,cc_dep2,mkU32(0xFF))));
+ }
+ if (isU32(cc_op, X86G_CC_OP_LOGICL)
+ || isU32(cc_op, X86G_CC_OP_LOGICW)
+ || isU32(cc_op, X86G_CC_OP_LOGICB)) {
+ /* cflag after logic is zero */
+ return mkU32(0);
+ }
+ if (isU32(cc_op, X86G_CC_OP_DECL) || isU32(cc_op, X86G_CC_OP_INCL)) {
+ /* If the thunk is dec or inc, the cflag is supplied as CC_NDEP. */
+ return cc_ndep;
+ }
+ if (isU32(cc_op, X86G_CC_OP_COPY)) {
+ /* cflag after COPY is stored in DEP1. */
+ return
+ binop(
+ Iop_And32,
+ binop(Iop_Shr32, cc_dep1, mkU8(X86G_CC_SHIFT_C)),
+ mkU32(1)
+ );
+ }
+ if (isU32(cc_op, X86G_CC_OP_ADDL)) {
+ /* C after add denotes sum <u either arg */
+ return unop(Iop_1Uto32,
+ binop(Iop_CmpLT32U,
+ binop(Iop_Add32, cc_dep1, cc_dep2),
+ cc_dep1));
+ }
+ // ATC, requires verification, no test case known
+ //if (isU32(cc_op, X86G_CC_OP_SMULL)) {
+ // /* C after signed widening multiply denotes the case where
+ // the top half of the result isn't simply the sign extension
+ // of the bottom half (iow the result doesn't fit completely
+ // in the bottom half). Hence:
+ // C = hi-half(dep1 x dep2) != lo-half(dep1 x dep2) >>s 31
+ // where 'x' denotes signed widening multiply.*/
+ // return
+ // unop(Iop_1Uto32,
+ // binop(Iop_CmpNE32,
+ // unop(Iop_64HIto32,
+ // binop(Iop_MullS32, cc_dep1, cc_dep2)),
+ // binop(Iop_Sar32,
+ // binop(Iop_Mul32, cc_dep1, cc_dep2), mkU8(31)) ));
+ //}
+# if 0
+ if (cc_op->tag == Iex_Const) {
+ vex_printf("CFLAG "); ppIRExpr(cc_op); vex_printf("\n");
+ }
+# endif
+
+ return NULL;
+ }
+
+ /* --------- specialising "x86g_calculate_eflags_all" --------- */
+
+ if (vex_streq(function_name, "x86g_calculate_eflags_all")) {
+ /* specialise calls to above "calculate_eflags_all" function */
+ IRExpr *cc_op, *cc_dep1; /*, *cc_dep2, *cc_ndep; */
+ vassert(arity == 4);
+ cc_op = args[0];
+ cc_dep1 = args[1];
+ /* cc_dep2 = args[2]; */
+ /* cc_ndep = args[3]; */
+
+ if (isU32(cc_op, X86G_CC_OP_COPY)) {
+ /* eflags after COPY are stored in DEP1. */
+ return
+ binop(
+ Iop_And32,
+ cc_dep1,
+ mkU32(X86G_CC_MASK_O | X86G_CC_MASK_S | X86G_CC_MASK_Z
+ | X86G_CC_MASK_A | X86G_CC_MASK_C | X86G_CC_MASK_P)
+ );
+ }
+ return NULL;
+ }
+
+# undef unop
+# undef binop
+# undef mkU32
+# undef mkU8
+
+ return NULL;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Supporting functions for x87 FPU activities. ---*/
+/*---------------------------------------------------------------*/
+
+static inline Bool host_is_little_endian ( void )
+{
+ UInt x = 0x76543210;
+ UChar* p = (UChar*)(&x);
+ return toBool(*p == 0x10);
+}
+
+/* 80 and 64-bit floating point formats:
+
+ 80-bit:
+
+ S 0 0-------0 zero
+ S 0 0X------X denormals
+ S 1-7FFE 1X------X normals (all normals have leading 1)
+ S 7FFF 10------0 infinity
+ S 7FFF 10X-----X snan
+ S 7FFF 11X-----X qnan
+
+   S is the sign bit.  For runs X----X, at least one of the Xs must
+   be nonzero.  The exponent is 15 bits and the fractional part is
+   63 bits; together with the explicitly represented leading 1 and
+   the sign bit, that gives 80 bits in total.
+
+ 64-bit avoids the confusion of an explicitly represented leading 1
+ and so is simpler:
+
+ S 0 0------0 zero
+ S 0 X------X denormals
+ S 1-7FE any normals
+ S 7FF 0------0 infinity
+ S 7FF 0X-----X snan
+ S 7FF 1X-----X qnan
+
+ Exponent is 11 bits, fractional part is 52 bits, and there is a
+ sign bit, giving 64 in total.
+*/
+
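+/* Worked example (an illustrative sketch, not part of the build;
+   the function name is hypothetical): extracting these fields from
+   a 64-bit little-endian image, in the same way
+   x86g_calculate_FXAM below does. */
+#if 0
+static void classify_f64le_example ( void )
+{
+   ULong  dbl  = 0x3FF0000000000000ULL;       /* the double 1.0 */
+   UChar* f64  = (UChar*)&dbl;                /* little-endian bytes */
+   UChar  sign = toUChar((f64[7] >> 7) & 1);  /* 0 */
+   Int    bexp = ((f64[7] << 4) | ((f64[6] >> 4) & 0x0F)) & 0x7FF;
+   /* bexp == 0x3FF; the mantissa (f64[0..5] and the low nibble of
+      f64[6]) is all zeroes.  Since bexp is neither 0 nor 0x7FF,
+      this is a normal finite number. */
+}
+#endif
+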
+/* Inspect a value and its tag, as per the x87 'FXAM' instruction. */
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+UInt x86g_calculate_FXAM ( UInt tag, ULong dbl )
+{
+ Bool mantissaIsZero;
+ Int bexp;
+ UChar sign;
+ UChar* f64;
+
+ vassert(host_is_little_endian());
+
+ /* vex_printf("calculate_FXAM ( %d, %llx ) .. ", tag, dbl ); */
+
+ f64 = (UChar*)(&dbl);
+ sign = toUChar( (f64[7] >> 7) & 1 );
+
+ /* First off, if the tag indicates the register was empty,
+ return 1,0,sign,1 */
+ if (tag == 0) {
+ /* vex_printf("Empty\n"); */
+ return X86G_FC_MASK_C3 | 0 | (sign << X86G_FC_SHIFT_C1)
+ | X86G_FC_MASK_C0;
+ }
+
+ bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
+ bexp &= 0x7FF;
+
+ mantissaIsZero
+ = toBool(
+ (f64[6] & 0x0F) == 0
+ && (f64[5] | f64[4] | f64[3] | f64[2] | f64[1] | f64[0]) == 0
+ );
+
+ /* If both exponent and mantissa are zero, the value is zero.
+ Return 1,0,sign,0. */
+ if (bexp == 0 && mantissaIsZero) {
+ /* vex_printf("Zero\n"); */
+ return X86G_FC_MASK_C3 | 0
+ | (sign << X86G_FC_SHIFT_C1) | 0;
+ }
+
+ /* If exponent is zero but mantissa isn't, it's a denormal.
+ Return 1,1,sign,0. */
+ if (bexp == 0 && !mantissaIsZero) {
+ /* vex_printf("Denormal\n"); */
+ return X86G_FC_MASK_C3 | X86G_FC_MASK_C2
+ | (sign << X86G_FC_SHIFT_C1) | 0;
+ }
+
+ /* If the exponent is 7FF and the mantissa is zero, this is an infinity.
+ Return 0,1,sign,1. */
+ if (bexp == 0x7FF && mantissaIsZero) {
+ /* vex_printf("Inf\n"); */
+ return 0 | X86G_FC_MASK_C2 | (sign << X86G_FC_SHIFT_C1)
+ | X86G_FC_MASK_C0;
+ }
+
+ /* If the exponent is 7FF and the mantissa isn't zero, this is a NaN.
+ Return 0,0,sign,1. */
+ if (bexp == 0x7FF && !mantissaIsZero) {
+ /* vex_printf("NaN\n"); */
+ return 0 | 0 | (sign << X86G_FC_SHIFT_C1) | X86G_FC_MASK_C0;
+ }
+
+ /* Uh, ok, we give up. It must be a normal finite number.
+ Return 0,1,sign,0.
+ */
+ /* vex_printf("normal\n"); */
+ return 0 | X86G_FC_MASK_C2 | (sign << X86G_FC_SHIFT_C1) | 0;
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest memory) */
+ULong x86g_dirtyhelper_loadF80le ( UInt addrU )
+{
+ ULong f64;
+ convert_f80le_to_f64le ( (UChar*)ULong_to_Ptr(addrU), (UChar*)&f64 );
+ return f64;
+}
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest memory) */
+void x86g_dirtyhelper_storeF80le ( UInt addrU, ULong f64 )
+{
+ convert_f64le_to_f80le( (UChar*)&f64, (UChar*)ULong_to_Ptr(addrU) );
+}
+
+
+/*----------------------------------------------*/
+/*--- The exported fns .. ---*/
+/*----------------------------------------------*/
+
+/* Layout of the real x87 state. */
+/* 13 June 05: Fpu_State and auxiliary constants were moved to
+   guest_generic_x87.h */
+
+
+/* CLEAN HELPER */
+/* fpucw[15:0] contains an x87 native format FPU control word.
+ Extract from it the required FPROUND value and any resulting
+ emulation warning, and return (warn << 32) | fpround value.
+*/
+ULong x86g_check_fldcw ( UInt fpucw )
+{
+ /* Decide on a rounding mode. fpucw[11:10] holds it. */
+ /* NOTE, encoded exactly as per enum IRRoundingMode. */
+ UInt rmode = (fpucw >> 10) & 3;
+
+ /* Detect any required emulation warnings. */
+ VexEmWarn ew = EmWarn_NONE;
+
+ if ((fpucw & 0x3F) != 0x3F) {
+ /* unmasked exceptions! */
+ ew = EmWarn_X86_x87exns;
+ }
+ else
+ if (((fpucw >> 8) & 3) != 3) {
+ /* unsupported precision */
+ ew = EmWarn_X86_x87precision;
+ }
+
+ return (((ULong)ew) << 32) | ((ULong)rmode);
+}
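+
+/* For illustration (a worked example, derived from the code above):
+   with the power-on control word 0x037F all six exception bits are
+   masked and precision control is 11b, so no warning results and
+   the rounding mode is (0x037F >> 10) & 3 == 0 (round to nearest):
+      x86g_check_fldcw(0x037F) == ((ULong)EmWarn_NONE << 32) | 0ULL
+   whereas 0x0340, which leaves exceptions unmasked, yields
+   EmWarn_X86_x87exns in the upper 32 bits. */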
+
+/* CLEAN HELPER */
+/* Given fpround as an IRRoundingMode value, create a suitable x87
+ native format FPU control word. */
+UInt x86g_create_fpucw ( UInt fpround )
+{
+ fpround &= 3;
+ return 0x037F | (fpround << 10);
+}
+
+
+/* CLEAN HELPER */
+/* mxcsr[15:0] contains an SSE native format MXCSR value.
+ Extract from it the required SSEROUND value and any resulting
+ emulation warning, and return (warn << 32) | sseround value.
+*/
+ULong x86g_check_ldmxcsr ( UInt mxcsr )
+{
+ /* Decide on a rounding mode. mxcsr[14:13] holds it. */
+ /* NOTE, encoded exactly as per enum IRRoundingMode. */
+ UInt rmode = (mxcsr >> 13) & 3;
+
+ /* Detect any required emulation warnings. */
+ VexEmWarn ew = EmWarn_NONE;
+
+ if ((mxcsr & 0x1F80) != 0x1F80) {
+ /* unmasked exceptions! */
+ ew = EmWarn_X86_sseExns;
+ }
+ else
+ if (mxcsr & (1<<15)) {
+ /* FZ is set */
+ ew = EmWarn_X86_fz;
+ }
+ else
+ if (mxcsr & (1<<6)) {
+ /* DAZ is set */
+ ew = EmWarn_X86_daz;
+ }
+
+ return (((ULong)ew) << 32) | ((ULong)rmode);
+}
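+
+/* For illustration (a worked example): the reset MXCSR value 0x1F80
+   has all six exception mask bits set and FZ/DAZ clear, so
+      x86g_check_ldmxcsr(0x1F80) == ((ULong)EmWarn_NONE << 32) | 0ULL
+   (no warning, round to nearest), while 0x9F80 -- the same value
+   with bit 15 (FZ) set -- returns EmWarn_X86_fz in the upper half. */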
+
+
+/* CLEAN HELPER */
+/* Given sseround as an IRRoundingMode value, create a suitable SSE
+ native format MXCSR value. */
+UInt x86g_create_mxcsr ( UInt sseround )
+{
+ sseround &= 3;
+ return 0x1F80 | (sseround << 13);
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state) */
+/* Initialise the x87 FPU state as per 'finit'. */
+void x86g_dirtyhelper_FINIT ( VexGuestX86State* gst )
+{
+ Int i;
+ gst->guest_FTOP = 0;
+ for (i = 0; i < 8; i++) {
+ gst->guest_FPTAG[i] = 0; /* empty */
+ gst->guest_FPREG[i] = 0; /* IEEE754 64-bit zero */
+ }
+ gst->guest_FPROUND = (UInt)Irrm_NEAREST;
+ gst->guest_FC3210 = 0;
+}
+
+
+/* This is used to implement both 'frstor' and 'fldenv'. The latter
+ appears to differ from the former only in that the 8 FP registers
+ themselves are not transferred into the guest state. */
+static
+VexEmWarn do_put_x87 ( Bool moveRegs,
+ /*IN*/UChar* x87_state,
+ /*OUT*/VexGuestX86State* vex_state )
+{
+ Int stno, preg;
+ UInt tag;
+ ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
+ UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
+ Fpu_State* x87 = (Fpu_State*)x87_state;
+ UInt ftop = (x87->env[FP_ENV_STAT] >> 11) & 7;
+ UInt tagw = x87->env[FP_ENV_TAG];
+ UInt fpucw = x87->env[FP_ENV_CTRL];
+ UInt c3210 = x87->env[FP_ENV_STAT] & 0x4700;
+ VexEmWarn ew;
+ UInt fpround;
+ ULong pair;
+
+ /* Copy registers and tags */
+ for (stno = 0; stno < 8; stno++) {
+ preg = (stno + ftop) & 7;
+ tag = (tagw >> (2*preg)) & 3;
+ if (tag == 3) {
+ /* register is empty */
+ /* hmm, if it's empty, does it still get written? Probably
+ safer to say it does. If we don't, memcheck could get out
+ of sync, in that it thinks all FP registers are defined by
+ this helper, but in reality some have not been updated. */
+ if (moveRegs)
+ vexRegs[preg] = 0; /* IEEE754 64-bit zero */
+ vexTags[preg] = 0;
+ } else {
+ /* register is non-empty */
+ if (moveRegs)
+ convert_f80le_to_f64le( &x87->reg[10*stno],
+ (UChar*)&vexRegs[preg] );
+ vexTags[preg] = 1;
+ }
+ }
+
+ /* stack pointer */
+ vex_state->guest_FTOP = ftop;
+
+ /* status word */
+ vex_state->guest_FC3210 = c3210;
+
+ /* handle the control word, setting FPROUND and detecting any
+ emulation warnings. */
+ pair = x86g_check_fldcw ( (UInt)fpucw );
+ fpround = (UInt)pair;
+ ew = (VexEmWarn)(pair >> 32);
+
+ vex_state->guest_FPROUND = fpround & 3;
+
+ /* emulation warnings --> caller */
+ return ew;
+}
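+
+/* Worked example (illustrative): with ftop == 7 and an image tag
+   word of 0x3FFF, stno 0 maps to preg 7, whose 2-bit tag
+   (0x3FFF >> 14) & 3 is 0 (non-empty); so vexTags[7] is set to 1
+   and the 80-bit image of ST(0) is converted into vexRegs[7].  The
+   other seven physical registers all carry tag 3 and are marked
+   empty. */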
+
+
+/* Create an x87 FPU state from the guest state, as close as
+ we can approximate it. */
+static
+void do_get_x87 ( /*IN*/VexGuestX86State* vex_state,
+ /*OUT*/UChar* x87_state )
+{
+ Int i, stno, preg;
+ UInt tagw;
+ ULong* vexRegs = (ULong*)(&vex_state->guest_FPREG[0]);
+ UChar* vexTags = (UChar*)(&vex_state->guest_FPTAG[0]);
+ Fpu_State* x87 = (Fpu_State*)x87_state;
+ UInt ftop = vex_state->guest_FTOP;
+ UInt c3210 = vex_state->guest_FC3210;
+
+ for (i = 0; i < 14; i++)
+ x87->env[i] = 0;
+
+ x87->env[1] = x87->env[3] = x87->env[5] = x87->env[13] = 0xFFFF;
+ x87->env[FP_ENV_STAT]
+ = toUShort(((ftop & 7) << 11) | (c3210 & 0x4700));
+ x87->env[FP_ENV_CTRL]
+ = toUShort(x86g_create_fpucw( vex_state->guest_FPROUND ));
+
+ /* Dump the register stack in ST order. */
+ tagw = 0;
+ for (stno = 0; stno < 8; stno++) {
+ preg = (stno + ftop) & 7;
+ if (vexTags[preg] == 0) {
+ /* register is empty */
+ tagw |= (3 << (2*preg));
+ convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
+ &x87->reg[10*stno] );
+ } else {
+ /* register is full. */
+ tagw |= (0 << (2*preg));
+ convert_f64le_to_f80le( (UChar*)&vexRegs[preg],
+ &x87->reg[10*stno] );
+ }
+ }
+ x87->env[FP_ENV_TAG] = toUShort(tagw);
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest state, writes guest mem) */
+void x86g_dirtyhelper_FXSAVE ( VexGuestX86State* gst, HWord addr )
+{
+ /* Somewhat roundabout, but at least it's simple. */
+ Fpu_State tmp;
+ UShort* addrS = (UShort*)addr;
+ UChar* addrC = (UChar*)addr;
+ U128* xmm = (U128*)(addr + 160);
+ UInt mxcsr;
+ UShort fp_tags;
+ UInt summary_tags;
+ Int r, stno;
+ UShort *srcS, *dstS;
+
+ do_get_x87( gst, (UChar*)&tmp );
+ mxcsr = x86g_create_mxcsr( gst->guest_SSEROUND );
+
+ /* Now build the proper fxsave image from the x87 image we just
+ made. */
+
+ addrS[0] = tmp.env[FP_ENV_CTRL]; /* FCW: fpu control word */
+   addrS[1]  = tmp.env[FP_ENV_STAT]; /* FSW: fpu status word */
+
+ /* set addrS[2] in an endian-independent way */
+ summary_tags = 0;
+ fp_tags = tmp.env[FP_ENV_TAG];
+ for (r = 0; r < 8; r++) {
+ if ( ((fp_tags >> (2*r)) & 3) != 3 )
+ summary_tags |= (1 << r);
+ }
+ addrC[4] = toUChar(summary_tags); /* FTW: tag summary byte */
+ addrC[5] = 0; /* pad */
+
+ addrS[3] = 0; /* FOP: fpu opcode (bogus) */
+ addrS[4] = 0;
+ addrS[5] = 0; /* FPU IP (bogus) */
+ addrS[6] = 0; /* FPU IP's segment selector (bogus) (although we
+ could conceivably dump %CS here) */
+
+ addrS[7] = 0; /* Intel reserved */
+
+ addrS[8] = 0; /* FPU DP (operand pointer) (bogus) */
+ addrS[9] = 0; /* FPU DP (operand pointer) (bogus) */
+ addrS[10] = 0; /* segment selector for above operand pointer; %DS
+ perhaps? */
+ addrS[11] = 0; /* Intel reserved */
+
+ addrS[12] = toUShort(mxcsr); /* MXCSR */
+ addrS[13] = toUShort(mxcsr >> 16);
+
+ addrS[14] = 0xFFFF; /* MXCSR mask (lo16); who knows what for */
+ addrS[15] = 0xFFFF; /* MXCSR mask (hi16); who knows what for */
+
+ /* Copy in the FP registers, in ST order. */
+ for (stno = 0; stno < 8; stno++) {
+ srcS = (UShort*)(&tmp.reg[10*stno]);
+ dstS = (UShort*)(&addrS[16 + 8*stno]);
+ dstS[0] = srcS[0];
+ dstS[1] = srcS[1];
+ dstS[2] = srcS[2];
+ dstS[3] = srcS[3];
+ dstS[4] = srcS[4];
+ dstS[5] = 0;
+ dstS[6] = 0;
+ dstS[7] = 0;
+ }
+
+ /* That's the first 160 bytes of the image done. Now only %xmm0
+ .. %xmm7 remain to be copied. If the host is big-endian, these
+ need to be byte-swapped. */
+ vassert(host_is_little_endian());
+
+# define COPY_U128(_dst,_src) \
+ do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
+ _dst[2] = _src[2]; _dst[3] = _src[3]; } \
+ while (0)
+
+ COPY_U128( xmm[0], gst->guest_XMM0 );
+ COPY_U128( xmm[1], gst->guest_XMM1 );
+ COPY_U128( xmm[2], gst->guest_XMM2 );
+ COPY_U128( xmm[3], gst->guest_XMM3 );
+ COPY_U128( xmm[4], gst->guest_XMM4 );
+ COPY_U128( xmm[5], gst->guest_XMM5 );
+ COPY_U128( xmm[6], gst->guest_XMM6 );
+ COPY_U128( xmm[7], gst->guest_XMM7 );
+
+# undef COPY_U128
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state, reads guest mem) */
+VexEmWarn x86g_dirtyhelper_FXRSTOR ( VexGuestX86State* gst, HWord addr )
+{
+ Fpu_State tmp;
+ VexEmWarn warnX87 = EmWarn_NONE;
+ VexEmWarn warnXMM = EmWarn_NONE;
+ UShort* addrS = (UShort*)addr;
+ UChar* addrC = (UChar*)addr;
+ U128* xmm = (U128*)(addr + 160);
+ UShort fp_tags;
+ Int r, stno, i;
+
+ /* Restore %xmm0 .. %xmm7. If the host is big-endian, these need
+ to be byte-swapped. */
+ vassert(host_is_little_endian());
+
+# define COPY_U128(_dst,_src) \
+ do { _dst[0] = _src[0]; _dst[1] = _src[1]; \
+ _dst[2] = _src[2]; _dst[3] = _src[3]; } \
+ while (0)
+
+ COPY_U128( gst->guest_XMM0, xmm[0] );
+ COPY_U128( gst->guest_XMM1, xmm[1] );
+ COPY_U128( gst->guest_XMM2, xmm[2] );
+ COPY_U128( gst->guest_XMM3, xmm[3] );
+ COPY_U128( gst->guest_XMM4, xmm[4] );
+ COPY_U128( gst->guest_XMM5, xmm[5] );
+ COPY_U128( gst->guest_XMM6, xmm[6] );
+ COPY_U128( gst->guest_XMM7, xmm[7] );
+
+# undef COPY_U128
+
+ /* Copy the x87 registers out of the image, into a temporary
+ Fpu_State struct. */
+ for (i = 0; i < 14; i++) tmp.env[i] = 0;
+ for (i = 0; i < 80; i++) tmp.reg[i] = 0;
+ /* fill in tmp.reg[0..7] */
+ for (stno = 0; stno < 8; stno++) {
+ UShort* dstS = (UShort*)(&tmp.reg[10*stno]);
+ UShort* srcS = (UShort*)(&addrS[16 + 8*stno]);
+ dstS[0] = srcS[0];
+ dstS[1] = srcS[1];
+ dstS[2] = srcS[2];
+ dstS[3] = srcS[3];
+ dstS[4] = srcS[4];
+ }
+ /* fill in tmp.env[0..13] */
+ tmp.env[FP_ENV_CTRL] = addrS[0]; /* FCW: fpu control word */
+   tmp.env[FP_ENV_STAT] = addrS[1]; /* FSW: fpu status word */
+
+ fp_tags = 0;
+ for (r = 0; r < 8; r++) {
+ if (addrC[4] & (1<<r))
+         fp_tags |= (0 << (2*r)); /* VALID -- not really precise enough. */
+      else
+         fp_tags |= (3 << (2*r)); /* EMPTY */
+ }
+ tmp.env[FP_ENV_TAG] = fp_tags;
+
+ /* Now write 'tmp' into the guest state. */
+ warnX87 = do_put_x87( True/*moveRegs*/, (UChar*)&tmp, gst );
+
+ { UInt w32 = (((UInt)addrS[12]) & 0xFFFF)
+ | ((((UInt)addrS[13]) & 0xFFFF) << 16);
+ ULong w64 = x86g_check_ldmxcsr( w32 );
+
+ warnXMM = (VexEmWarn)(w64 >> 32);
+
+ gst->guest_SSEROUND = (UInt)w64;
+ }
+
+ /* Prefer an X87 emwarn over an XMM one, if both exist. */
+ if (warnX87 != EmWarn_NONE)
+ return warnX87;
+ else
+ return warnXMM;
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest state, writes guest mem) */
+void x86g_dirtyhelper_FSAVE ( VexGuestX86State* gst, HWord addr )
+{
+ do_get_x87( gst, (UChar*)addr );
+}
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state, reads guest mem) */
+VexEmWarn x86g_dirtyhelper_FRSTOR ( VexGuestX86State* gst, HWord addr )
+{
+ return do_put_x87( True/*regs too*/, (UChar*)addr, gst );
+}
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (reads guest state, writes guest mem) */
+void x86g_dirtyhelper_FSTENV ( VexGuestX86State* gst, HWord addr )
+{
+ /* Somewhat roundabout, but at least it's simple. */
+ Int i;
+ UShort* addrP = (UShort*)addr;
+ Fpu_State tmp;
+ do_get_x87( gst, (UChar*)&tmp );
+ for (i = 0; i < 14; i++)
+ addrP[i] = tmp.env[i];
+}
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (writes guest state, reads guest mem) */
+VexEmWarn x86g_dirtyhelper_FLDENV ( VexGuestX86State* gst, HWord addr )
+{
+ return do_put_x87( False/*don't move regs*/, (UChar*)addr, gst);
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Misc integer helpers, including rotates and CPUID. ---*/
+/*---------------------------------------------------------------*/
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate both flags and value result for rotate right
+ through the carry bit. Result in low 32 bits,
+ new flags (OSZACP) in high 32 bits.
+*/
+ULong x86g_calculate_RCR ( UInt arg, UInt rot_amt, UInt eflags_in, UInt sz )
+{
+ UInt tempCOUNT = rot_amt & 0x1F, cf=0, of=0, tempcf;
+
+ switch (sz) {
+ case 4:
+ cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
+ of = ((arg >> 31) ^ cf) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = arg & 1;
+ arg = (arg >> 1) | (cf << 31);
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ break;
+ case 2:
+ while (tempCOUNT >= 17) tempCOUNT -= 17;
+ cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
+ of = ((arg >> 15) ^ cf) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = arg & 1;
+ arg = ((arg >> 1) & 0x7FFF) | (cf << 15);
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ break;
+ case 1:
+ while (tempCOUNT >= 9) tempCOUNT -= 9;
+ cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
+ of = ((arg >> 7) ^ cf) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = arg & 1;
+ arg = ((arg >> 1) & 0x7F) | (cf << 7);
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ break;
+ default:
+ vpanic("calculate_RCR: invalid size");
+ }
+
+ cf &= 1;
+ of &= 1;
+ eflags_in &= ~(X86G_CC_MASK_C | X86G_CC_MASK_O);
+ eflags_in |= (cf << X86G_CC_SHIFT_C) | (of << X86G_CC_SHIFT_O);
+
+ return (((ULong)eflags_in) << 32) | ((ULong)arg);
+}
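+
+/* Worked example (illustrative): an 8-bit RCR of arg 0x01 by one
+   place with CF == 0 rotates the old CF into bit 7 and the old bit
+   0 out into CF, giving arg == 0x00 and CF == 1.  OF is computed
+   before the rotation as (original bit 7) XOR (incoming CF), here
+   0 ^ 0 == 0. */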
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate both flags and value result for rotate left
+ through the carry bit. Result in low 32 bits,
+ new flags (OSZACP) in high 32 bits.
+*/
+ULong x86g_calculate_RCL ( UInt arg, UInt rot_amt, UInt eflags_in, UInt sz )
+{
+ UInt tempCOUNT = rot_amt & 0x1F, cf=0, of=0, tempcf;
+
+ switch (sz) {
+ case 4:
+ cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = (arg >> 31) & 1;
+ arg = (arg << 1) | (cf & 1);
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ of = ((arg >> 31) ^ cf) & 1;
+ break;
+ case 2:
+ while (tempCOUNT >= 17) tempCOUNT -= 17;
+ cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = (arg >> 15) & 1;
+ arg = 0xFFFF & ((arg << 1) | (cf & 1));
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ of = ((arg >> 15) ^ cf) & 1;
+ break;
+ case 1:
+ while (tempCOUNT >= 9) tempCOUNT -= 9;
+ cf = (eflags_in >> X86G_CC_SHIFT_C) & 1;
+ while (tempCOUNT > 0) {
+ tempcf = (arg >> 7) & 1;
+ arg = 0xFF & ((arg << 1) | (cf & 1));
+ cf = tempcf;
+ tempCOUNT--;
+ }
+ of = ((arg >> 7) ^ cf) & 1;
+ break;
+ default:
+ vpanic("calculate_RCL: invalid size");
+ }
+
+ cf &= 1;
+ of &= 1;
+ eflags_in &= ~(X86G_CC_MASK_C | X86G_CC_MASK_O);
+ eflags_in |= (cf << X86G_CC_SHIFT_C) | (of << X86G_CC_SHIFT_O);
+
+ return (((ULong)eflags_in) << 32) | ((ULong)arg);
+}
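+
+/* Worked example (illustrative): an 8-bit RCL of arg 0x80 by one
+   place with CF == 0 moves the old bit 7 out into CF and the old CF
+   into bit 0, giving arg == 0x00 and CF == 1.  OF is computed after
+   the rotation as (result bit 7) XOR (new CF), here 0 ^ 1 == 1. */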
+
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+/* Calculate both flags and value result for DAA/DAS/AAA/AAS.
+ AX value in low half of arg, OSZACP in upper half.
+ See guest-x86/toIR.c usage point for details.
+*/
+static UInt calc_parity_8bit ( UInt w32 ) {
+ UInt i;
+ UInt p = 1;
+ for (i = 0; i < 8; i++)
+ p ^= (1 & (w32 >> i));
+ return p;
+}
+UInt x86g_calculate_daa_das_aaa_aas ( UInt flags_and_AX, UInt opcode )
+{
+ UInt r_AL = (flags_and_AX >> 0) & 0xFF;
+ UInt r_AH = (flags_and_AX >> 8) & 0xFF;
+ UInt r_O = (flags_and_AX >> (16 + X86G_CC_SHIFT_O)) & 1;
+ UInt r_S = (flags_and_AX >> (16 + X86G_CC_SHIFT_S)) & 1;
+ UInt r_Z = (flags_and_AX >> (16 + X86G_CC_SHIFT_Z)) & 1;
+ UInt r_A = (flags_and_AX >> (16 + X86G_CC_SHIFT_A)) & 1;
+ UInt r_C = (flags_and_AX >> (16 + X86G_CC_SHIFT_C)) & 1;
+ UInt r_P = (flags_and_AX >> (16 + X86G_CC_SHIFT_P)) & 1;
+ UInt result = 0;
+
+ switch (opcode) {
+ case 0x27: { /* DAA */
+ UInt old_AL = r_AL;
+ UInt old_C = r_C;
+ r_C = 0;
+ if ((r_AL & 0xF) > 9 || r_A == 1) {
+ r_AL = r_AL + 6;
+ r_C = old_C;
+ if (r_AL >= 0x100) r_C = 1;
+ r_A = 1;
+ } else {
+ r_A = 0;
+ }
+ if (old_AL > 0x99 || old_C == 1) {
+ r_AL = r_AL + 0x60;
+ r_C = 1;
+ } else {
+ r_C = 0;
+ }
+ /* O is undefined. S Z and P are set according to the
+ result. */
+ r_AL &= 0xFF;
+ r_O = 0; /* let's say */
+ r_S = (r_AL & 0x80) ? 1 : 0;
+ r_Z = (r_AL == 0) ? 1 : 0;
+ r_P = calc_parity_8bit( r_AL );
+ break;
+ }
+ case 0x2F: { /* DAS */
+ UInt old_AL = r_AL;
+ UInt old_C = r_C;
+ r_C = 0;
+ if ((r_AL & 0xF) > 9 || r_A == 1) {
+ Bool borrow = r_AL < 6;
+ r_AL = r_AL - 6;
+ r_C = old_C;
+ if (borrow) r_C = 1;
+ r_A = 1;
+ } else {
+ r_A = 0;
+ }
+ if (old_AL > 0x99 || old_C == 1) {
+ r_AL = r_AL - 0x60;
+ r_C = 1;
+ } else {
+ /* Intel docs are wrong: r_C = 0; */
+ }
+ /* O is undefined. S Z and P are set according to the
+ result. */
+ r_AL &= 0xFF;
+ r_O = 0; /* let's say */
+ r_S = (r_AL & 0x80) ? 1 : 0;
+ r_Z = (r_AL == 0) ? 1 : 0;
+ r_P = calc_parity_8bit( r_AL );
+ break;
+ }
+ case 0x37: { /* AAA */
+ Bool nudge = r_AL > 0xF9;
+ if ((r_AL & 0xF) > 9 || r_A == 1) {
+ r_AL = r_AL + 6;
+ r_AH = r_AH + 1 + (nudge ? 1 : 0);
+ r_A = 1;
+ r_C = 1;
+ r_AL = r_AL & 0xF;
+ } else {
+ r_A = 0;
+ r_C = 0;
+ r_AL = r_AL & 0xF;
+ }
+ /* O S Z and P are undefined. */
+ r_O = r_S = r_Z = r_P = 0; /* let's say */
+ break;
+ }
+ case 0x3F: { /* AAS */
+ Bool nudge = r_AL < 0x06;
+ if ((r_AL & 0xF) > 9 || r_A == 1) {
+ r_AL = r_AL - 6;
+ r_AH = r_AH - 1 - (nudge ? 1 : 0);
+ r_A = 1;
+ r_C = 1;
+ r_AL = r_AL & 0xF;
+ } else {
+ r_A = 0;
+ r_C = 0;
+ r_AL = r_AL & 0xF;
+ }
+ /* O S Z and P are undefined. */
+ r_O = r_S = r_Z = r_P = 0; /* let's say */
+ break;
+ }
+ default:
+ vassert(0);
+ }
+ result = ( (r_O & 1) << (16 + X86G_CC_SHIFT_O) )
+ | ( (r_S & 1) << (16 + X86G_CC_SHIFT_S) )
+ | ( (r_Z & 1) << (16 + X86G_CC_SHIFT_Z) )
+ | ( (r_A & 1) << (16 + X86G_CC_SHIFT_A) )
+ | ( (r_C & 1) << (16 + X86G_CC_SHIFT_C) )
+ | ( (r_P & 1) << (16 + X86G_CC_SHIFT_P) )
+ | ( (r_AH & 0xFF) << 8 )
+ | ( (r_AL & 0xFF) << 0 );
+ return result;
+}
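+
+/* Worked example (illustrative): DAA (opcode 0x27) on AL == 0x9C
+   with A == C == 0.  The low nibble 0xC exceeds 9, so 6 is added
+   (AL == 0xA2, A == 1); the original AL 0x9C exceeds 0x99, so 0x60
+   is added as well and C is set.  The result is AL == 0x02 with
+   C == 1, A == 1, S == Z == 0 and P recomputed over the 8-bit
+   result. */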
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially-transparent) */
+/* Horrible hack. On non-x86 platforms, return 1. */
+ULong x86g_dirtyhelper_RDTSC ( void )
+{
+# if defined(__i386__)
+ ULong res;
+ __asm__ __volatile__("rdtsc" : "=A" (res));
+ return res;
+# else
+ return 1ULL;
+# endif
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (modifies guest state) */
+/* Claim to be a P55C (Intel Pentium/MMX) */
+void x86g_dirtyhelper_CPUID_sse0 ( VexGuestX86State* st )
+{
+ switch (st->guest_EAX) {
+ case 0:
+ st->guest_EAX = 0x1;
+ st->guest_EBX = 0x756e6547;
+ st->guest_ECX = 0x6c65746e;
+ st->guest_EDX = 0x49656e69;
+ break;
+ default:
+ st->guest_EAX = 0x543;
+ st->guest_EBX = 0x0;
+ st->guest_ECX = 0x0;
+ st->guest_EDX = 0x8001bf;
+ break;
+ }
+}
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (modifies guest state) */
+/* Claim to be the following SSE1-capable CPU:
+ vendor_id : GenuineIntel
+ cpu family : 6
+ model : 11
+ model name : Intel(R) Pentium(R) III CPU family 1133MHz
+ stepping : 1
+ cpu MHz : 1131.013
+ cache size : 512 KB
+*/
+void x86g_dirtyhelper_CPUID_sse1 ( VexGuestX86State* st )
+{
+ switch (st->guest_EAX) {
+ case 0:
+ st->guest_EAX = 0x00000002;
+ st->guest_EBX = 0x756e6547;
+ st->guest_ECX = 0x6c65746e;
+ st->guest_EDX = 0x49656e69;
+ break;
+ case 1:
+ st->guest_EAX = 0x000006b1;
+ st->guest_EBX = 0x00000004;
+ st->guest_ECX = 0x00000000;
+ st->guest_EDX = 0x0383fbff;
+ break;
+ default:
+ st->guest_EAX = 0x03020101;
+ st->guest_EBX = 0x00000000;
+ st->guest_ECX = 0x00000000;
+ st->guest_EDX = 0x0c040883;
+ break;
+ }
+}
+
+/* Claim to be the following SSSE3-capable CPU (2 x ...):
+ vendor_id : GenuineIntel
+ cpu family : 6
+ model : 15
+ model name : Intel(R) Core(TM)2 CPU 6600 @ 2.40GHz
+ stepping : 6
+ cpu MHz : 2394.000
+ cache size : 4096 KB
+ physical id : 0
+ siblings : 2
+ core id : 0
+ cpu cores : 2
+ fpu : yes
+ fpu_exception : yes
+ cpuid level : 10
+ wp : yes
+ flags : fpu vme de pse tsc msr pae mce cx8 apic sep
+ mtrr pge mca cmov pat pse36 clflush dts acpi
+ mmx fxsr sse sse2 ss ht tm syscall nx lm
+ constant_tsc pni monitor ds_cpl vmx est tm2
+ cx16 xtpr lahf_lm
+ bogomips : 4798.78
+ clflush size : 64
+ cache_alignment : 64
+ address sizes : 36 bits physical, 48 bits virtual
+ power management:
+*/
+void x86g_dirtyhelper_CPUID_sse2 ( VexGuestX86State* st )
+{
+# define SET_ABCD(_a,_b,_c,_d) \
+ do { st->guest_EAX = (UInt)(_a); \
+ st->guest_EBX = (UInt)(_b); \
+ st->guest_ECX = (UInt)(_c); \
+ st->guest_EDX = (UInt)(_d); \
+ } while (0)
+
+ switch (st->guest_EAX) {
+ case 0x00000000:
+ SET_ABCD(0x0000000a, 0x756e6547, 0x6c65746e, 0x49656e69);
+ break;
+ case 0x00000001:
+ SET_ABCD(0x000006f6, 0x00020800, 0x0000e3bd, 0xbfebfbff);
+ break;
+ case 0x00000002:
+ SET_ABCD(0x05b0b101, 0x005657f0, 0x00000000, 0x2cb43049);
+ break;
+ case 0x00000003:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000004: {
+ switch (st->guest_ECX) {
+ case 0x00000000: SET_ABCD(0x04000121, 0x01c0003f,
+ 0x0000003f, 0x00000001); break;
+ case 0x00000001: SET_ABCD(0x04000122, 0x01c0003f,
+ 0x0000003f, 0x00000001); break;
+ case 0x00000002: SET_ABCD(0x04004143, 0x03c0003f,
+ 0x00000fff, 0x00000001); break;
+ default: SET_ABCD(0x00000000, 0x00000000,
+ 0x00000000, 0x00000000); break;
+ }
+ break;
+ }
+ case 0x00000005:
+ SET_ABCD(0x00000040, 0x00000040, 0x00000003, 0x00000020);
+ break;
+ case 0x00000006:
+ SET_ABCD(0x00000001, 0x00000002, 0x00000001, 0x00000000);
+ break;
+ case 0x00000007:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000008:
+ SET_ABCD(0x00000400, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x00000009:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x0000000a:
+ unhandled_eax_value:
+ SET_ABCD(0x07280202, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000000:
+ SET_ABCD(0x80000008, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000001:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000001, 0x20100000);
+ break;
+ case 0x80000002:
+ SET_ABCD(0x65746e49, 0x2952286c, 0x726f4320, 0x4d542865);
+ break;
+ case 0x80000003:
+ SET_ABCD(0x43203229, 0x20205550, 0x20202020, 0x20202020);
+ break;
+ case 0x80000004:
+ SET_ABCD(0x30303636, 0x20402020, 0x30342e32, 0x007a4847);
+ break;
+ case 0x80000005:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000006:
+ SET_ABCD(0x00000000, 0x00000000, 0x10008040, 0x00000000);
+ break;
+ case 0x80000007:
+ SET_ABCD(0x00000000, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ case 0x80000008:
+ SET_ABCD(0x00003024, 0x00000000, 0x00000000, 0x00000000);
+ break;
+ default:
+ goto unhandled_eax_value;
+ }
+# undef SET_ABCD
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially-transparent) */
+/* Horrible hack. On non-x86 platforms, return 0. */
+UInt x86g_dirtyhelper_IN ( UInt portno, UInt sz/*1,2 or 4*/ )
+{
+# if defined(__i386__)
+ UInt r = 0;
+ portno &= 0xFFFF;
+ switch (sz) {
+ case 4:
+ __asm__ __volatile__("movl $0,%%eax; inl %w1,%0"
+ : "=a" (r) : "Nd" (portno));
+ break;
+ case 2:
+ __asm__ __volatile__("movl $0,%%eax; inw %w1,%w0"
+ : "=a" (r) : "Nd" (portno));
+ break;
+ case 1:
+ __asm__ __volatile__("movl $0,%%eax; inb %w1,%b0"
+ : "=a" (r) : "Nd" (portno));
+ break;
+ default:
+ break;
+ }
+ return r;
+# else
+ return 0;
+# endif
+}
+
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially-transparent) */
+/* Horrible hack. On non-x86 platforms, do nothing. */
+void x86g_dirtyhelper_OUT ( UInt portno, UInt data, UInt sz/*1,2 or 4*/ )
+{
+# if defined(__i386__)
+ portno &= 0xFFFF;
+ switch (sz) {
+ case 4:
+ __asm__ __volatile__("outl %0, %w1"
+ : : "a" (data), "Nd" (portno));
+ break;
+ case 2:
+ __asm__ __volatile__("outw %w0, %w1"
+ : : "a" (data), "Nd" (portno));
+ break;
+ case 1:
+ __asm__ __volatile__("outb %b0, %w1"
+ : : "a" (data), "Nd" (portno));
+ break;
+ default:
+ break;
+ }
+# else
+ /* do nothing */
+# endif
+}
+
+/* CALLED FROM GENERATED CODE */
+/* DIRTY HELPER (non-referentially-transparent) */
+/* Horrible hack. On non-x86 platforms, do nothing. */
+/* op = 0: call the native SGDT instruction.
+ op = 1: call the native SIDT instruction.
+*/
+void x86g_dirtyhelper_SxDT ( void *address, UInt op ) {
+# if defined(__i386__)
+ switch (op) {
+ case 0:
+ __asm__ __volatile__("sgdt (%0)" : : "r" (address) : "memory");
+ break;
+ case 1:
+ __asm__ __volatile__("sidt (%0)" : : "r" (address) : "memory");
+ break;
+ default:
+ vpanic("x86g_dirtyhelper_SxDT");
+ }
+# else
+   /* No native SGDT/SIDT to call; instead zero out the 6 bytes the
+      caller expects, so the result is at least defined. */
+ UChar* p = (UChar*)address;
+ p[0] = p[1] = p[2] = p[3] = p[4] = p[5] = 0;
+# endif
+}
+
+/*---------------------------------------------------------------*/
+/*--- Helpers for MMX/SSE/SSE2. ---*/
+/*---------------------------------------------------------------*/
+
+static inline UChar abdU8 ( UChar xx, UChar yy ) {
+ return toUChar(xx>yy ? xx-yy : yy-xx);
+}
+
+static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
+ return (((ULong)w1) << 32) | ((ULong)w0);
+}
+
+static inline UShort sel16x4_3 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUShort(hi32 >> 16);
+}
+static inline UShort sel16x4_2 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUShort(hi32);
+}
+static inline UShort sel16x4_1 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUShort(lo32 >> 16);
+}
+static inline UShort sel16x4_0 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUShort(lo32);
+}
+
+static inline UChar sel8x8_7 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(hi32 >> 24);
+}
+static inline UChar sel8x8_6 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(hi32 >> 16);
+}
+static inline UChar sel8x8_5 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(hi32 >> 8);
+}
+static inline UChar sel8x8_4 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(hi32 >> 0);
+}
+static inline UChar sel8x8_3 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUChar(lo32 >> 24);
+}
+static inline UChar sel8x8_2 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUChar(lo32 >> 16);
+}
+static inline UChar sel8x8_1 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUChar(lo32 >> 8);
+}
+static inline UChar sel8x8_0 ( ULong w64 ) {
+ UInt lo32 = toUInt(w64);
+ return toUChar(lo32 >> 0);
+}
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+ULong x86g_calculate_mmx_pmaddwd ( ULong xx, ULong yy )
+{
+ return
+ mk32x2(
+ (((Int)(Short)sel16x4_3(xx)) * ((Int)(Short)sel16x4_3(yy)))
+ + (((Int)(Short)sel16x4_2(xx)) * ((Int)(Short)sel16x4_2(yy))),
+ (((Int)(Short)sel16x4_1(xx)) * ((Int)(Short)sel16x4_1(yy)))
+ + (((Int)(Short)sel16x4_0(xx)) * ((Int)(Short)sel16x4_0(yy)))
+ );
+}
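+
+/* For example (illustrative): if every 16-bit lane of xx is 2 and
+   every lane of yy is 3, each lane product is 6, so both 32-bit
+   halves of the result are 12 and the helper returns
+   0x0000000C0000000CULL. */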
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+UInt x86g_calculate_mmx_pmovmskb ( ULong xx )
+{
+ UInt r = 0;
+ if (xx & (1ULL << (64-1))) r |= (1<<7);
+ if (xx & (1ULL << (56-1))) r |= (1<<6);
+ if (xx & (1ULL << (48-1))) r |= (1<<5);
+ if (xx & (1ULL << (40-1))) r |= (1<<4);
+ if (xx & (1ULL << (32-1))) r |= (1<<3);
+ if (xx & (1ULL << (24-1))) r |= (1<<2);
+ if (xx & (1ULL << (16-1))) r |= (1<<1);
+ if (xx & (1ULL << ( 8-1))) r |= (1<<0);
+ return r;
+}
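+
+/* For example (illustrative): xx == 0x8000000000000080ULL has the
+   top bits of bytes 7 and 0 set, so the result is
+   (1<<7) | (1<<0) == 0x81. */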
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+ULong x86g_calculate_mmx_psadbw ( ULong xx, ULong yy )
+{
+ UInt t = 0;
+ t += (UInt)abdU8( sel8x8_7(xx), sel8x8_7(yy) );
+ t += (UInt)abdU8( sel8x8_6(xx), sel8x8_6(yy) );
+ t += (UInt)abdU8( sel8x8_5(xx), sel8x8_5(yy) );
+ t += (UInt)abdU8( sel8x8_4(xx), sel8x8_4(yy) );
+ t += (UInt)abdU8( sel8x8_3(xx), sel8x8_3(yy) );
+ t += (UInt)abdU8( sel8x8_2(xx), sel8x8_2(yy) );
+ t += (UInt)abdU8( sel8x8_1(xx), sel8x8_1(yy) );
+ t += (UInt)abdU8( sel8x8_0(xx), sel8x8_0(yy) );
+ t &= 0xFFFF;
+ return (ULong)t;
+}
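+
+/* For example (illustrative): with xx == 0x0807060504030201ULL and
+   yy == 0, the absolute byte differences are 1..8, which sum to 36,
+   so the helper returns 0x24. */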
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo )
+{
+ UInt rHi8 = x86g_calculate_mmx_pmovmskb ( w64hi );
+ UInt rLo8 = x86g_calculate_mmx_pmovmskb ( w64lo );
+ return ((rHi8 & 0xFF) << 8) | (rLo8 & 0xFF);
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Helpers for dealing with segment overrides. ---*/
+/*---------------------------------------------------------------*/
+
+static inline
+UInt get_segdescr_base ( VexGuestX86SegDescr* ent )
+{
+ UInt lo = 0xFFFF & (UInt)ent->LdtEnt.Bits.BaseLow;
+ UInt mid = 0xFF & (UInt)ent->LdtEnt.Bits.BaseMid;
+ UInt hi = 0xFF & (UInt)ent->LdtEnt.Bits.BaseHi;
+ return (hi << 24) | (mid << 16) | lo;
+}
+
+static inline
+UInt get_segdescr_limit ( VexGuestX86SegDescr* ent )
+{
+ UInt lo = 0xFFFF & (UInt)ent->LdtEnt.Bits.LimitLow;
+ UInt hi = 0xF & (UInt)ent->LdtEnt.Bits.LimitHi;
+ UInt limit = (hi << 16) | lo;
+ if (ent->LdtEnt.Bits.Granularity)
+ limit = (limit << 12) | 0xFFF;
+ return limit;
+}
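+
+/* For example (illustrative): a descriptor with LimitLow == 0xFFFF,
+   LimitHi == 0xF and the granularity bit set describes a flat 4GB
+   segment: the 20-bit limit 0xFFFFF is scaled to
+   (0xFFFFF << 12) | 0xFFF == 0xFFFFFFFF. */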
+
+/* CALLED FROM GENERATED CODE: CLEAN HELPER */
+ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
+ UInt seg_selector, UInt virtual_addr )
+{
+ UInt tiBit, base, limit;
+ VexGuestX86SegDescr* the_descrs;
+
+ Bool verboze = False;
+
+ /* If this isn't true, we're in Big Trouble. */
+ vassert(8 == sizeof(VexGuestX86SegDescr));
+
+ if (verboze)
+      vex_printf("x86g_use_seg_selector: "
+ "seg_selector = 0x%x, vaddr = 0x%x\n",
+ seg_selector, virtual_addr);
+
+ /* Check for wildly invalid selector. */
+ if (seg_selector & ~0xFFFF)
+ goto bad;
+
+ seg_selector &= 0x0000FFFF;
+
+ /* Sanity check the segment selector. Ensure that RPL=11b (least
+ privilege). This forms the bottom 2 bits of the selector. */
+ if ((seg_selector & 3) != 3)
+ goto bad;
+
+ /* Extract the TI bit (0 means GDT, 1 means LDT) */
+ tiBit = (seg_selector >> 2) & 1;
+
+ /* Convert the segment selector onto a table index */
+ seg_selector >>= 3;
+   vassert(seg_selector < 8192); /* it is unsigned, so >= 0 trivially */
+
+ if (tiBit == 0) {
+
+ /* GDT access. */
+ /* Do we actually have a GDT to look at? */
+ if (gdt == 0)
+ goto bad;
+
+ /* Check for access to non-existent entry. */
+ if (seg_selector >= VEX_GUEST_X86_GDT_NENT)
+ goto bad;
+
+ the_descrs = (VexGuestX86SegDescr*)gdt;
+ base = get_segdescr_base (&the_descrs[seg_selector]);
+ limit = get_segdescr_limit(&the_descrs[seg_selector]);
+
+ } else {
+
+ /* All the same stuff, except for the LDT. */
+ if (ldt == 0)
+ goto bad;
+
+ if (seg_selector >= VEX_GUEST_X86_LDT_NENT)
+ goto bad;
+
+ the_descrs = (VexGuestX86SegDescr*)ldt;
+ base = get_segdescr_base (&the_descrs[seg_selector]);
+ limit = get_segdescr_limit(&the_descrs[seg_selector]);
+
+ }
+
+ /* Do the limit check. Note, this check is just slightly too
+ slack. Really it should be "if (virtual_addr + size - 1 >=
+ limit)," but we don't have the size info to hand. Getting it
+ could be significantly complex. */
+ if (virtual_addr >= limit)
+ goto bad;
+
+ if (verboze)
+      vex_printf("x86g_use_seg_selector: "
+ "base = 0x%x, addr = 0x%x\n",
+ base, base + virtual_addr);
+
+ /* High 32 bits are zero, indicating success. */
+ return (ULong)( ((UInt)virtual_addr) + base );
+
+ bad:
+ return 1ULL << 32;
+}
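+
+/* Usage sketch (illustrative, not compiled; 'ldt' is assumed to
+   point at a valid LDT array).  The selector 0x000F has RPL == 3
+   (bits 1:0), TI == 1 (bit 2, so the LDT is used) and table
+   index 1: */
+#if 0
+   ULong r = x86g_use_seg_selector(ldt, 0/*no GDT*/, 0x000F, 0x1234);
+   if (r >> 32) {
+      /* invalid selector, or 0x1234 is beyond the segment limit */
+   } else {
+      UInt linear = (UInt)r;  /* base of LDT entry 1, plus 0x1234 */
+   }
+#endif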
+
+
+/*---------------------------------------------------------------*/
+/*--- Helpers for dealing with, and describing, ---*/
+/*--- guest state as a whole. ---*/
+/*---------------------------------------------------------------*/
+
+/* Initialise the entire x86 guest state. */
+/* VISIBLE TO LIBVEX CLIENT */
+void LibVEX_GuestX86_initialise ( /*OUT*/VexGuestX86State* vex_state )
+{
+ vex_state->guest_EAX = 0;
+ vex_state->guest_ECX = 0;
+ vex_state->guest_EDX = 0;
+ vex_state->guest_EBX = 0;
+ vex_state->guest_ESP = 0;
+ vex_state->guest_EBP = 0;
+ vex_state->guest_ESI = 0;
+ vex_state->guest_EDI = 0;
+
+ vex_state->guest_CC_OP = X86G_CC_OP_COPY;
+ vex_state->guest_CC_DEP1 = 0;
+ vex_state->guest_CC_DEP2 = 0;
+ vex_state->guest_CC_NDEP = 0;
+ vex_state->guest_DFLAG = 1; /* forwards */
+ vex_state->guest_IDFLAG = 0;
+ vex_state->guest_ACFLAG = 0;
+
+ vex_state->guest_EIP = 0;
+
+ /* Initialise the simulated FPU */
+ x86g_dirtyhelper_FINIT( vex_state );
+
+   /* Initialise the SSE state. */
+# define SSEZERO(_xmm) _xmm[0]=_xmm[1]=_xmm[2]=_xmm[3] = 0;
+
+ vex_state->guest_SSEROUND = (UInt)Irrm_NEAREST;
+ SSEZERO(vex_state->guest_XMM0);
+ SSEZERO(vex_state->guest_XMM1);
+ SSEZERO(vex_state->guest_XMM2);
+ SSEZERO(vex_state->guest_XMM3);
+ SSEZERO(vex_state->guest_XMM4);
+ SSEZERO(vex_state->guest_XMM5);
+ SSEZERO(vex_state->guest_XMM6);
+ SSEZERO(vex_state->guest_XMM7);
+
+# undef SSEZERO
+
+ vex_state->guest_CS = 0;
+ vex_state->guest_DS = 0;
+ vex_state->guest_ES = 0;
+ vex_state->guest_FS = 0;
+ vex_state->guest_GS = 0;
+ vex_state->guest_SS = 0;
+ vex_state->guest_LDT = 0;
+ vex_state->guest_GDT = 0;
+
+ vex_state->guest_EMWARN = EmWarn_NONE;
+
+ /* SSE2 has a 'clflush' cache-line-invalidator which uses these. */
+ vex_state->guest_TISTART = 0;
+ vex_state->guest_TILEN = 0;
+
+ vex_state->guest_NRADDR = 0;
+ vex_state->guest_SC_CLASS = 0;
+ vex_state->guest_IP_AT_SYSCALL = 0;
+
+ vex_state->padding1 = 0;
+ vex_state->padding2 = 0;
+ vex_state->padding3 = 0;
+}
+
+
+/* Figure out if any part of the guest state contained in minoff
+ .. maxoff requires precise memory exceptions. If in doubt return
+   True (but this generates significantly slower code).
+
+ By default we enforce precise exns for guest %ESP, %EBP and %EIP
+ only. These are the minimum needed to extract correct stack
+ backtraces from x86 code.
+*/
+Bool guest_x86_state_requires_precise_mem_exns ( Int minoff,
+ Int maxoff)
+{
+ Int ebp_min = offsetof(VexGuestX86State, guest_EBP);
+ Int ebp_max = ebp_min + 4 - 1;
+ Int esp_min = offsetof(VexGuestX86State, guest_ESP);
+ Int esp_max = esp_min + 4 - 1;
+ Int eip_min = offsetof(VexGuestX86State, guest_EIP);
+ Int eip_max = eip_min + 4 - 1;
+
+ if (maxoff < ebp_min || minoff > ebp_max) {
+ /* no overlap with ebp */
+ } else {
+ return True;
+ }
+
+ if (maxoff < esp_min || minoff > esp_max) {
+ /* no overlap with esp */
+ } else {
+ return True;
+ }
+
+ if (maxoff < eip_min || minoff > eip_max) {
+ /* no overlap with eip */
+ } else {
+ return True;
+ }
+
+ return False;
+}
+
+
+#define ALWAYSDEFD(field) \
+ { offsetof(VexGuestX86State, field), \
+ (sizeof ((VexGuestX86State*)0)->field) }
+
+VexGuestLayout
+ x86guest_layout
+ = {
+ /* Total size of the guest state, in bytes. */
+ .total_sizeB = sizeof(VexGuestX86State),
+
+ /* Describe the stack pointer. */
+ .offset_SP = offsetof(VexGuestX86State,guest_ESP),
+ .sizeof_SP = 4,
+
+ /* Describe the frame pointer. */
+ .offset_FP = offsetof(VexGuestX86State,guest_EBP),
+ .sizeof_FP = 4,
+
+ /* Describe the instruction pointer. */
+ .offset_IP = offsetof(VexGuestX86State,guest_EIP),
+ .sizeof_IP = 4,
+
+ /* Describe any sections to be regarded by Memcheck as
+ 'always-defined'. */
+ .n_alwaysDefd = 24,
+
+ /* flags thunk: OP and NDEP are always defd, whereas DEP1
+ and DEP2 have to be tracked. See detailed comment in
+ gdefs.h on meaning of thunk fields. */
+ .alwaysDefd
+ = { /* 0 */ ALWAYSDEFD(guest_CC_OP),
+ /* 1 */ ALWAYSDEFD(guest_CC_NDEP),
+ /* 2 */ ALWAYSDEFD(guest_DFLAG),
+ /* 3 */ ALWAYSDEFD(guest_IDFLAG),
+ /* 4 */ ALWAYSDEFD(guest_ACFLAG),
+ /* 5 */ ALWAYSDEFD(guest_EIP),
+ /* 6 */ ALWAYSDEFD(guest_FTOP),
+ /* 7 */ ALWAYSDEFD(guest_FPTAG),
+ /* 8 */ ALWAYSDEFD(guest_FPROUND),
+ /* 9 */ ALWAYSDEFD(guest_FC3210),
+ /* 10 */ ALWAYSDEFD(guest_CS),
+ /* 11 */ ALWAYSDEFD(guest_DS),
+ /* 12 */ ALWAYSDEFD(guest_ES),
+ /* 13 */ ALWAYSDEFD(guest_FS),
+ /* 14 */ ALWAYSDEFD(guest_GS),
+ /* 15 */ ALWAYSDEFD(guest_SS),
+ /* 16 */ ALWAYSDEFD(guest_LDT),
+ /* 17 */ ALWAYSDEFD(guest_GDT),
+ /* 18 */ ALWAYSDEFD(guest_EMWARN),
+ /* 19 */ ALWAYSDEFD(guest_SSEROUND),
+ /* 20 */ ALWAYSDEFD(guest_TISTART),
+ /* 21 */ ALWAYSDEFD(guest_TILEN),
+ /* 22 */ ALWAYSDEFD(guest_SC_CLASS),
+ /* 23 */ ALWAYSDEFD(guest_IP_AT_SYSCALL)
+ }
+ };
+
+
+/*---------------------------------------------------------------*/
+/*--- end guest_x86_helpers.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/guest_x86_toIR.c b/VEX/priv/guest_x86_toIR.c
new file mode 100644
index 0000000..d03b6f1
--- /dev/null
+++ b/VEX/priv/guest_x86_toIR.c
@@ -0,0 +1,15138 @@
+
+/*--------------------------------------------------------------------*/
+/*--- begin guest_x86_toIR.c ---*/
+/*--------------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* Translates x86 code to IR. */
+
+/* TODO:
+
+ All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
+ to ensure a 32-bit value is being written.
+
+   FUCOMI(P): what happens to A and S flags?  Currently they are
+   forced to zero.
+
+ x87 FP Limitations:
+
+ * all arithmetic done at 64 bits
+
+ * no FP exceptions, except for handling stack over/underflow
+
+ * FP rounding mode observed only for float->int conversions
+ and int->float conversions which could lose accuracy, and
+ for float-to-float rounding. For all other operations,
+ round-to-nearest is used, regardless.
+
+ * FP sin/cos/tan/sincos: C2 flag is always cleared. IOW the
+ simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
+ even when it isn't.
+
+ * some of the FCOM cases could do with testing -- not convinced
+ that the args are the right way round.
+
+ * FSAVE does not re-initialise the FPU; it should do
+
+ * FINIT not only initialises the FPU environment, it also
+ zeroes all the FP registers. It should leave the registers
+ unchanged.
+
+ SAHF should cause eflags[1] == 1, and in fact it produces 0. As
+ per Intel docs this bit has no meaning anyway. Since PUSHF is the
+ only way to observe eflags[1], a proper fix would be to make that
+ bit be set by PUSHF.
+
+ The state of %eflags.AC (alignment check, bit 18) is recorded by
+ the simulation (viz, if you set it with popf then a pushf produces
+ the value you set it to), but it is otherwise ignored. In
+ particular, setting it to 1 does NOT cause alignment checking to
+ happen. Programs that set it to 1 and then rely on the resulting
+ SIGBUSs to inform them of misaligned accesses will not work.
+
+ Implementation of sysenter is necessarily partial. sysenter is a
+ kind of system call entry. When doing a sysenter, the return
+ address is not known -- that is something that is beyond Vex's
+ knowledge. So the generated IR forces a return to the scheduler,
+   which can do what it likes to simulate the sysenter, but it MUST
+ set this thread's guest_EIP field with the continuation address
+ before resuming execution. If that doesn't happen, the thread will
+ jump to address zero, which is probably fatal.
+
+ This module uses global variables and so is not MT-safe (if that
+ should ever become relevant).
+
+ The delta values are 32-bit ints, not 64-bit ints. That means
+ this module may not work right if run on a 64-bit host. That should
+ be fixed properly, really -- if anyone ever wants to use Vex to
+ translate x86 code for execution on a 64-bit host.
+
+ casLE (implementation of lock-prefixed insns) and rep-prefixed
+ insns: the side-exit back to the start of the insn is done with
+ Ijk_Boring. This is quite wrong, it should be done with
+ Ijk_NoRedir, since otherwise the side exit, which is intended to
+ restart the instruction for whatever reason, could go somewhere
+ entirely else. Doing it right (with Ijk_NoRedir jumps) would make
+ no-redir jumps performance critical, at least for rep-prefixed
+ instructions, since all iterations thereof would involve such a
+ jump. It's not such a big deal with casLE since the side exit is
+ only taken if the CAS fails, that is, the location is contended,
+ which is relatively unlikely.
+
+ XXXX: Nov 2009: handling of SWP on ARM suffers from the same
+ problem.
+
+ Note also, the test for CAS success vs failure is done using
+ Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
+ Iop_Cmp{EQ,NE} equivalents. This is so as to tell Memcheck that it
+ shouldn't definedness-check these comparisons. See
+ COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
+ background/rationale.
+*/
+
+/* Performance holes:
+
+ - fcom ; fstsw %ax ; sahf
+ sahf does not update the O flag (sigh) and so O needs to
+ be computed. This is done expensively; it would be better
+ to have a calculate_eflags_o helper.
+
+ - emwarns; some FP codes can generate huge numbers of these
+ if the fpucw is changed in an inner loop. It would be
+ better for the guest state to have an emwarn-enable reg
+ which can be set zero or nonzero. If it is zero, emwarns
+ are not flagged, and instead control just flows all the
+ way through bbs as usual.
+*/
+
+/* "Special" instructions.
+
+ This instruction decoder can decode three special instructions
+ which mean nothing natively (are no-ops as far as regs/mem are
+ concerned) but have meaning for supporting Valgrind. A special
+ instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
+ C1C713 (in the standard interpretation, that means: roll $3, %edi;
+ roll $13, %edi; roll $29, %edi; roll $19, %edi). Following that,
+ one of the following 3 are allowed (standard interpretation in
+ parentheses):
+
+ 87DB (xchgl %ebx,%ebx) %EDX = client_request ( %EAX )
+ 87C9 (xchgl %ecx,%ecx) %EAX = guest_NRADDR
+ 87D2 (xchgl %edx,%edx) call-noredir *%EAX
+
+ Any other bytes following the 12-byte preamble are illegal and
+ constitute a failure in instruction decoding. This all assumes
+ that the preamble will never occur except in specific code
+ fragments designed for Valgrind to catch.
+
+ No prefixes may precede a "Special" instruction.
+*/
+
+/* LOCK prefixed instructions. These are translated using IR-level
+ CAS statements (IRCAS) and are believed to preserve atomicity, even
+ from the point of view of some other process racing against a
+ simulated one (presumably they communicate via a shared memory
+ segment).
+
+ Handlers which are aware of LOCK prefixes are:
+ dis_op2_G_E (add, or, adc, sbb, and, sub, xor)
+ dis_cmpxchg_G_E (cmpxchg)
+ dis_Grp1 (add, or, adc, sbb, and, sub, xor)
+ dis_Grp3 (not, neg)
+ dis_Grp4 (inc, dec)
+ dis_Grp5 (inc, dec)
+ dis_Grp8_Imm (bts, btc, btr)
+ dis_bt_G_E (bts, btc, btr)
+ dis_xadd_G_E (xadd)
+*/
+
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+#include "libvex_guest_x86.h"
+
+#include "main_util.h"
+#include "main_globals.h"
+#include "guest_generic_bb_to_IR.h"
+#include "guest_generic_x87.h"
+#include "guest_x86_defs.h"
+
+
+/*------------------------------------------------------------*/
+/*--- Globals ---*/
+/*------------------------------------------------------------*/
+
+/* These are set at the start of the translation of an insn, right
+ down in disInstr_X86, so that we don't have to pass them around
+ endlessly. They are all constant during the translation of any
+ given insn. */
+
+/* We need to know this to do sub-register accesses correctly. */
+static Bool host_is_bigendian;
+
+/* Pointer to the guest code area (points to start of BB, not to the
+ insn being processed). */
+static UChar* guest_code;
+
+/* The guest address corresponding to guest_code[0]. */
+static Addr32 guest_EIP_bbstart;
+
+/* The guest address for the instruction currently being
+ translated. */
+static Addr32 guest_EIP_curr_instr;
+
+/* The IRSB* into which we're generating code. */
+static IRSB* irsb;
+
+
+/*------------------------------------------------------------*/
+/*--- Debugging output ---*/
+/*------------------------------------------------------------*/
+
+#define DIP(format, args...) \
+ if (vex_traceflags & VEX_TRACE_FE) \
+ vex_printf(format, ## args)
+
+#define DIS(buf, format, args...) \
+ if (vex_traceflags & VEX_TRACE_FE) \
+ vex_sprintf(buf, format, ## args)
+
+
+/*------------------------------------------------------------*/
+/*--- Offsets of various parts of the x86 guest state. ---*/
+/*------------------------------------------------------------*/
+
+#define OFFB_EAX offsetof(VexGuestX86State,guest_EAX)
+#define OFFB_EBX offsetof(VexGuestX86State,guest_EBX)
+#define OFFB_ECX offsetof(VexGuestX86State,guest_ECX)
+#define OFFB_EDX offsetof(VexGuestX86State,guest_EDX)
+#define OFFB_ESP offsetof(VexGuestX86State,guest_ESP)
+#define OFFB_EBP offsetof(VexGuestX86State,guest_EBP)
+#define OFFB_ESI offsetof(VexGuestX86State,guest_ESI)
+#define OFFB_EDI offsetof(VexGuestX86State,guest_EDI)
+
+#define OFFB_EIP offsetof(VexGuestX86State,guest_EIP)
+
+#define OFFB_CC_OP offsetof(VexGuestX86State,guest_CC_OP)
+#define OFFB_CC_DEP1 offsetof(VexGuestX86State,guest_CC_DEP1)
+#define OFFB_CC_DEP2 offsetof(VexGuestX86State,guest_CC_DEP2)
+#define OFFB_CC_NDEP offsetof(VexGuestX86State,guest_CC_NDEP)
+
+#define OFFB_FPREGS offsetof(VexGuestX86State,guest_FPREG[0])
+#define OFFB_FPTAGS offsetof(VexGuestX86State,guest_FPTAG[0])
+#define OFFB_DFLAG offsetof(VexGuestX86State,guest_DFLAG)
+#define OFFB_IDFLAG offsetof(VexGuestX86State,guest_IDFLAG)
+#define OFFB_ACFLAG offsetof(VexGuestX86State,guest_ACFLAG)
+#define OFFB_FTOP offsetof(VexGuestX86State,guest_FTOP)
+#define OFFB_FC3210 offsetof(VexGuestX86State,guest_FC3210)
+#define OFFB_FPROUND offsetof(VexGuestX86State,guest_FPROUND)
+
+#define OFFB_CS offsetof(VexGuestX86State,guest_CS)
+#define OFFB_DS offsetof(VexGuestX86State,guest_DS)
+#define OFFB_ES offsetof(VexGuestX86State,guest_ES)
+#define OFFB_FS offsetof(VexGuestX86State,guest_FS)
+#define OFFB_GS offsetof(VexGuestX86State,guest_GS)
+#define OFFB_SS offsetof(VexGuestX86State,guest_SS)
+#define OFFB_LDT offsetof(VexGuestX86State,guest_LDT)
+#define OFFB_GDT offsetof(VexGuestX86State,guest_GDT)
+
+#define OFFB_SSEROUND offsetof(VexGuestX86State,guest_SSEROUND)
+#define OFFB_XMM0 offsetof(VexGuestX86State,guest_XMM0)
+#define OFFB_XMM1 offsetof(VexGuestX86State,guest_XMM1)
+#define OFFB_XMM2 offsetof(VexGuestX86State,guest_XMM2)
+#define OFFB_XMM3 offsetof(VexGuestX86State,guest_XMM3)
+#define OFFB_XMM4 offsetof(VexGuestX86State,guest_XMM4)
+#define OFFB_XMM5 offsetof(VexGuestX86State,guest_XMM5)
+#define OFFB_XMM6 offsetof(VexGuestX86State,guest_XMM6)
+#define OFFB_XMM7 offsetof(VexGuestX86State,guest_XMM7)
+
+#define OFFB_EMWARN offsetof(VexGuestX86State,guest_EMWARN)
+
+#define OFFB_TISTART offsetof(VexGuestX86State,guest_TISTART)
+#define OFFB_TILEN offsetof(VexGuestX86State,guest_TILEN)
+#define OFFB_NRADDR offsetof(VexGuestX86State,guest_NRADDR)
+
+#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
+
+
+/*------------------------------------------------------------*/
+/*--- Helper bits and pieces for deconstructing the ---*/
+/*--- x86 insn stream. ---*/
+/*------------------------------------------------------------*/
+
+/* This is the Intel register encoding -- integer regs. */
+#define R_EAX 0
+#define R_ECX 1
+#define R_EDX 2
+#define R_EBX 3
+#define R_ESP 4
+#define R_EBP 5
+#define R_ESI 6
+#define R_EDI 7
+
+#define R_AL (0+R_EAX)
+#define R_AH (4+R_EAX)
+
+/* This is the Intel register encoding -- segment regs. */
+#define R_ES 0
+#define R_CS 1
+#define R_SS 2
+#define R_DS 3
+#define R_FS 4
+#define R_GS 5
+
+
+/* Add a statement to the list held by "irsb". */
+static void stmt ( IRStmt* st )
+{
+ addStmtToIRSB( irsb, st );
+}
+
+/* Generate a new temporary of the given type. */
+static IRTemp newTemp ( IRType ty )
+{
+ vassert(isPlausibleIRType(ty));
+ return newIRTemp( irsb->tyenv, ty );
+}
+
+/* Various simple conversions */
+
+static UInt extend_s_8to32( UInt x )
+{
+ return (UInt)((((Int)x) << 24) >> 24);
+}
+
+static UInt extend_s_16to32 ( UInt x )
+{
+ return (UInt)((((Int)x) << 16) >> 16);
+}
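+
+/* E.g. extend_s_8to32(0x80) == 0xFFFFFF80 and
+   extend_s_16to32(0x8000) == 0xFFFF8000. */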
+
+/* Fetch a byte from the guest insn stream. */
+static UChar getIByte ( Int delta )
+{
+ return guest_code[delta];
+}
+
+/* Extract the reg field from a modRM byte. */
+static Int gregOfRM ( UChar mod_reg_rm )
+{
+ return (Int)( (mod_reg_rm >> 3) & 7 );
+}
+
+/* Figure out whether the mod and rm parts of a modRM byte refer to a
+ register or memory. If so, the byte will have the form 11XXXYYY,
+ where YYY is the register number. */
+static Bool epartIsReg ( UChar mod_reg_rm )
+{
+ return toBool(0xC0 == (mod_reg_rm & 0xC0));
+}
+
+/* ... and extract the register number ... */
+static Int eregOfRM ( UChar mod_reg_rm )
+{
+ return (Int)(mod_reg_rm & 0x7);
+}
+
+/* Get an 8/16/32-bit unsigned value out of the insn stream. */
+
+static UChar getUChar ( Int delta )
+{
+ UChar v = guest_code[delta+0];
+ return toUChar(v);
+}
+
+static UInt getUDisp16 ( Int delta )
+{
+ UInt v = guest_code[delta+1]; v <<= 8;
+ v |= guest_code[delta+0];
+ return v & 0xFFFF;
+}
+
+static UInt getUDisp32 ( Int delta )
+{
+ UInt v = guest_code[delta+3]; v <<= 8;
+ v |= guest_code[delta+2]; v <<= 8;
+ v |= guest_code[delta+1]; v <<= 8;
+ v |= guest_code[delta+0];
+ return v;
+}
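+
+/* E.g. the byte sequence 78 56 34 12 at 'delta' is read,
+   little-endian, as 0x12345678. */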
+
+static UInt getUDisp ( Int size, Int delta )
+{
+ switch (size) {
+ case 4: return getUDisp32(delta);
+ case 2: return getUDisp16(delta);
+ case 1: return (UInt)getUChar(delta);
+ default: vpanic("getUDisp(x86)");
+ }
+ return 0; /*notreached*/
+}
+
+
+/* Get a byte value out of the insn stream and sign-extend to 32
+ bits. */
+static UInt getSDisp8 ( Int delta )
+{
+ return extend_s_8to32( (UInt) (guest_code[delta]) );
+}
+
+static UInt getSDisp16 ( Int delta0 )
+{
+ UChar* eip = (UChar*)(&guest_code[delta0]);
+ UInt d = *eip++;
+ d |= ((*eip++) << 8);
+ return extend_s_16to32(d);
+}
+
+static UInt getSDisp ( Int size, Int delta )
+{
+ switch (size) {
+ case 4: return getUDisp32(delta);
+ case 2: return getSDisp16(delta);
+ case 1: return getSDisp8(delta);
+ default: vpanic("getSDisp(x86)");
+ }
+ return 0; /*notreached*/
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for constructing IR. ---*/
+/*------------------------------------------------------------*/
+
+/* Create a 1/2/4 byte read of an x86 integer register.  For 16/8 bit
+ register references, we need to take the host endianness into
+ account. Supplied value is 0 .. 7 and in the Intel instruction
+ encoding. */
+
+static IRType szToITy ( Int n )
+{
+ switch (n) {
+ case 1: return Ity_I8;
+ case 2: return Ity_I16;
+ case 4: return Ity_I32;
+ default: vpanic("szToITy(x86)");
+ }
+}
+
+/* On a little-endian host, less significant bits of the guest
+   registers are at lower addresses.  Therefore, a reference to the
+   low half of a register has the same guest state offset as a
+   reference to the full register.
+*/
+static Int integerGuestRegOffset ( Int sz, UInt archreg )
+{
+ vassert(archreg < 8);
+
+ /* Correct for little-endian host only. */
+ vassert(!host_is_bigendian);
+
+ if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
+ switch (archreg) {
+ case R_EAX: return OFFB_EAX;
+ case R_EBX: return OFFB_EBX;
+ case R_ECX: return OFFB_ECX;
+ case R_EDX: return OFFB_EDX;
+ case R_ESI: return OFFB_ESI;
+ case R_EDI: return OFFB_EDI;
+ case R_ESP: return OFFB_ESP;
+ case R_EBP: return OFFB_EBP;
+ default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
+ }
+ }
+
+ vassert(archreg >= 4 && archreg < 8 && sz == 1);
+ switch (archreg-4) {
+ case R_EAX: return 1+ OFFB_EAX;
+ case R_EBX: return 1+ OFFB_EBX;
+ case R_ECX: return 1+ OFFB_ECX;
+ case R_EDX: return 1+ OFFB_EDX;
+ default: vpanic("integerGuestRegOffset(x86,le)(1h)");
+ }
+
+ /* NOTREACHED */
+ vpanic("integerGuestRegOffset(x86,le)");
+}
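+
+/* Illustrative consequence of the layout above: on a little-endian
+ host, getIReg(2,R_EAX) (%ax) and getIReg(4,R_EAX) (%eax) both read
+ from OFFB_EAX, while the 8-bit high-half registers (archreg 4..7
+ with sz == 1) read one byte further on, e.g. %ah at 1 + OFFB_EAX. */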
+
+static Int segmentGuestRegOffset ( UInt sreg )
+{
+ switch (sreg) {
+ case R_ES: return OFFB_ES;
+ case R_CS: return OFFB_CS;
+ case R_SS: return OFFB_SS;
+ case R_DS: return OFFB_DS;
+ case R_FS: return OFFB_FS;
+ case R_GS: return OFFB_GS;
+ default: vpanic("segmentGuestRegOffset(x86)");
+ }
+}
+
+static Int xmmGuestRegOffset ( UInt xmmreg )
+{
+ switch (xmmreg) {
+ case 0: return OFFB_XMM0;
+ case 1: return OFFB_XMM1;
+ case 2: return OFFB_XMM2;
+ case 3: return OFFB_XMM3;
+ case 4: return OFFB_XMM4;
+ case 5: return OFFB_XMM5;
+ case 6: return OFFB_XMM6;
+ case 7: return OFFB_XMM7;
+ default: vpanic("xmmGuestRegOffset");
+ }
+}
+
+/* Lanes of vector registers are always numbered from zero, zero
+ being the least significant lane (rightmost in the register). */
+
+static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
+{
+ /* Correct for little-endian host only. */
+ vassert(!host_is_bigendian);
+ vassert(laneno >= 0 && laneno < 8);
+ return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
+}
+
+static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
+{
+ /* Correct for little-endian host only. */
+ vassert(!host_is_bigendian);
+ vassert(laneno >= 0 && laneno < 4);
+ return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
+}
+
+static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
+{
+ /* Correct for little-endian host only. */
+ vassert(!host_is_bigendian);
+ vassert(laneno >= 0 && laneno < 2);
+ return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
+}
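+
+/* Example (illustrative): xmmGuestRegLane32offset(2,1) is
+ OFFB_XMM2 + 4, i.e. bits 63:32 of %xmm2, and
+ xmmGuestRegLane64offset(2,1) is OFFB_XMM2 + 8, i.e. bits 127:64. */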
+
+static IRExpr* getIReg ( Int sz, UInt archreg )
+{
+ vassert(sz == 1 || sz == 2 || sz == 4);
+ vassert(archreg < 8);
+ return IRExpr_Get( integerGuestRegOffset(sz,archreg),
+ szToITy(sz) );
+}
+
+/* Ditto, but write to a reg instead. */
+static void putIReg ( Int sz, UInt archreg, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(irsb->tyenv, e);
+ switch (sz) {
+ case 1: vassert(ty == Ity_I8); break;
+ case 2: vassert(ty == Ity_I16); break;
+ case 4: vassert(ty == Ity_I32); break;
+ default: vpanic("putIReg(x86)");
+ }
+ vassert(archreg < 8);
+ stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
+}
+
+static IRExpr* getSReg ( UInt sreg )
+{
+ return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
+}
+
+static void putSReg ( UInt sreg, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
+ stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
+}
+
+static IRExpr* getXMMReg ( UInt xmmreg )
+{
+ return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
+}
+
+static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
+{
+ return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
+}
+
+static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
+{
+ return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
+}
+
+static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
+{
+ return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
+}
+
+static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
+{
+ return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
+}
+
+static void putXMMReg ( UInt xmmreg, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
+ stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
+}
+
+static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
+ stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
+}
+
+static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
+ stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
+}
+
+static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
+ stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
+}
+
+static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
+ stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
+}
+
+static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
+ stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
+}
+
+static void assign ( IRTemp dst, IRExpr* e )
+{
+ stmt( IRStmt_WrTmp(dst, e) );
+}
+
+static void storeLE ( IRExpr* addr, IRExpr* data )
+{
+ stmt( IRStmt_Store(Iend_LE, addr, data) );
+}
+
+static IRExpr* unop ( IROp op, IRExpr* a )
+{
+ return IRExpr_Unop(op, a);
+}
+
+static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
+{
+ return IRExpr_Binop(op, a1, a2);
+}
+
+static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
+{
+ return IRExpr_Triop(op, a1, a2, a3);
+}
+
+static IRExpr* mkexpr ( IRTemp tmp )
+{
+ return IRExpr_RdTmp(tmp);
+}
+
+static IRExpr* mkU8 ( UInt i )
+{
+ vassert(i < 256);
+ return IRExpr_Const(IRConst_U8( (UChar)i ));
+}
+
+static IRExpr* mkU16 ( UInt i )
+{
+ vassert(i < 65536);
+ return IRExpr_Const(IRConst_U16( (UShort)i ));
+}
+
+static IRExpr* mkU32 ( UInt i )
+{
+ return IRExpr_Const(IRConst_U32(i));
+}
+
+static IRExpr* mkU64 ( ULong i )
+{
+ return IRExpr_Const(IRConst_U64(i));
+}
+
+static IRExpr* mkU ( IRType ty, UInt i )
+{
+ if (ty == Ity_I8) return mkU8(i);
+ if (ty == Ity_I16) return mkU16(i);
+ if (ty == Ity_I32) return mkU32(i);
+ /* If this panics, it usually means you passed a size (1,2,4)
+ value as the IRType, rather than a real IRType. */
+ vpanic("mkU(x86)");
+}
+
+static IRExpr* mkV128 ( UShort mask )
+{
+ return IRExpr_Const(IRConst_V128(mask));
+}
+
+static IRExpr* loadLE ( IRType ty, IRExpr* addr )
+{
+ return IRExpr_Load(Iend_LE, ty, addr);
+}
+
+static IROp mkSizedOp ( IRType ty, IROp op8 )
+{
+ Int adj;
+ vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
+ vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
+ || op8 == Iop_Mul8
+ || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
+ || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
+ || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
+ || op8 == Iop_CasCmpNE8
+ || op8 == Iop_Not8);
+ adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
+ return adj + op8;
+}
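+
+/* mkSizedOp relies on the 8/16/32-bit variants of each operation
+ being declared consecutively in the IROp enumeration (which
+ libvex_ir.h arranges), so that, as a sketch of the intended
+ algebra:
+
+ mkSizedOp(Ity_I8, Iop_Xor8) == Iop_Xor8
+ mkSizedOp(Ity_I16, Iop_Xor8) == Iop_Xor16
+ mkSizedOp(Ity_I32, Iop_Xor8) == Iop_Xor32
+*/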
+
+static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
+{
+ if (szSmall == 1 && szBig == 4) {
+ return signd ? Iop_8Sto32 : Iop_8Uto32;
+ }
+ if (szSmall == 1 && szBig == 2) {
+ return signd ? Iop_8Sto16 : Iop_8Uto16;
+ }
+ if (szSmall == 2 && szBig == 4) {
+ return signd ? Iop_16Sto32 : Iop_16Uto32;
+ }
+ vpanic("mkWidenOp(x86,guest)");
+}
+
+static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
+{
+ vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
+ vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
+ return unop(Iop_32to1,
+ binop(Iop_And32,
+ unop(Iop_1Uto32,x),
+ unop(Iop_1Uto32,y)));
+}
+
+/* Generate a compare-and-swap operation, operating on memory at
+ 'addr'. The expected value is 'expVal' and the new value is
+ 'newVal'. If the operation fails, then transfer control (with a
+ no-redir jump (XXX no -- see comment at top of this file)) to
+ 'restart_point', which is presumably the address of the guest
+ instruction again -- retrying, essentially. */
+static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
+ Addr32 restart_point )
+{
+ IRCAS* cas;
+ IRType tyE = typeOfIRExpr(irsb->tyenv, expVal);
+ IRType tyN = typeOfIRExpr(irsb->tyenv, newVal);
+ IRTemp oldTmp = newTemp(tyE);
+ IRTemp expTmp = newTemp(tyE);
+ vassert(tyE == tyN);
+ vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
+ assign(expTmp, expVal);
+ cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
+ NULL, mkexpr(expTmp), NULL, newVal );
+ stmt( IRStmt_CAS(cas) );
+ stmt( IRStmt_Exit(
+ binop( mkSizedOp(tyE,Iop_CasCmpNE8),
+ mkexpr(oldTmp), mkexpr(expTmp) ),
+ Ijk_Boring, /*Ijk_NoRedir*/
+ IRConst_U32( restart_point )
+ ));
+}
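+
+/* Usage sketch (illustrative; 'addr', 'old' and 'new' are assumed
+ names, with 'old' and 'new' same-typed temporaries): a locked
+ read-modify-write such as "lock addl" loads into 'old', computes
+ 'new', and then calls
+
+ casLE( mkexpr(addr), mkexpr(old), mkexpr(new),
+ guest_EIP_curr_instr );
+
+ so that if another thread changed the location in between, the
+ whole instruction is retried from its own address. */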
+
+
+/*------------------------------------------------------------*/
+/*--- Helpers for %eflags. ---*/
+/*------------------------------------------------------------*/
+
+/* -------------- Evaluating the flags-thunk. -------------- */
+
+/* Build IR to calculate all the eflags from stored
+ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
+ Ity_I32. */
+static IRExpr* mk_x86g_calculate_eflags_all ( void )
+{
+ IRExpr** args
+ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I32,
+ 0/*regparm*/,
+ "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
+ args
+ );
+ /* Exclude OP and NDEP from definedness checking. We're only
+ interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
+ return call;
+}
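+
+/* To make the lazy-flags scheme concrete (an illustrative note, not
+ extra functionality): after "addl %ebx,%eax" the thunk holds
+ CC_OP = X86G_CC_OP_ADDL, CC_DEP1 = the first argument and
+ CC_DEP2 = the second. No flags are computed at that point; only
+ when a later instruction such as "jz" needs them does the IR call
+ x86g_calculate_eflags_all/_c/_condition to rederive them from the
+ thunk fields. */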
+
+/* Build IR to calculate some particular condition from stored
+ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression ::
+ Ity_Bit. */
+static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
+{
+ IRExpr** args
+ = mkIRExprVec_5( mkU32(cond),
+ IRExpr_Get(OFFB_CC_OP, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I32,
+ 0/*regparm*/,
+ "x86g_calculate_condition", &x86g_calculate_condition,
+ args
+ );
+ /* Exclude the requested condition, OP and NDEP from definedness
+ checking. We're only interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
+ return unop(Iop_32to1, call);
+}
+
+/* Build IR to calculate just the carry flag from stored
+ CC_OP/CC_DEP1/CC_DEP2/CC_NDEP. Returns an expression :: Ity_I32. */
+static IRExpr* mk_x86g_calculate_eflags_c ( void )
+{
+ IRExpr** args
+ = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
+ IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
+ IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
+ IRExpr* call
+ = mkIRExprCCall(
+ Ity_I32,
+ 3/*regparm*/,
+ "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
+ args
+ );
+ /* Exclude OP and NDEP from definedness checking. We're only
+ interested in DEP1 and DEP2. */
+ call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
+ return call;
+}
+
+
+/* -------------- Building the flags-thunk. -------------- */
+
+/* The machinery in this section builds the flag-thunk following a
+ flag-setting operation. Hence the various setFlags_* functions.
+*/
+
+static Bool isAddSub ( IROp op8 )
+{
+ return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
+}
+
+static Bool isLogic ( IROp op8 )
+{
+ return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
+}
+
+/* U-widen 8/16/32 bit int expr to 32. */
+static IRExpr* widenUto32 ( IRExpr* e )
+{
+ switch (typeOfIRExpr(irsb->tyenv,e)) {
+ case Ity_I32: return e;
+ case Ity_I16: return unop(Iop_16Uto32,e);
+ case Ity_I8: return unop(Iop_8Uto32,e);
+ default: vpanic("widenUto32");
+ }
+}
+
+/* S-widen 8/16/32 bit int expr to 32. */
+static IRExpr* widenSto32 ( IRExpr* e )
+{
+ switch (typeOfIRExpr(irsb->tyenv,e)) {
+ case Ity_I32: return e;
+ case Ity_I16: return unop(Iop_16Sto32,e);
+ case Ity_I8: return unop(Iop_8Sto32,e);
+ default: vpanic("widenSto32");
+ }
+}
+
+/* Narrow 8/16/32 bit int expr to 8/16/32. Clearly only some
+ of these combinations make sense. */
+static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
+{
+ IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
+ if (src_ty == dst_ty)
+ return e;
+ if (src_ty == Ity_I32 && dst_ty == Ity_I16)
+ return unop(Iop_32to16, e);
+ if (src_ty == Ity_I32 && dst_ty == Ity_I8)
+ return unop(Iop_32to8, e);
+
+ vex_printf("\nsrc, dst tys are: ");
+ ppIRType(src_ty);
+ vex_printf(", ");
+ ppIRType(dst_ty);
+ vex_printf("\n");
+ vpanic("narrowTo(x86)");
+}
+
+
+/* Set the flags thunk OP, DEP1 and DEP2 fields. The supplied op is
+ auto-sized up to the real op. */
+
+static
+void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
+{
+ Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
+
+ vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
+
+ switch (op8) {
+ case Iop_Add8: ccOp += X86G_CC_OP_ADDB; break;
+ case Iop_Sub8: ccOp += X86G_CC_OP_SUBB; break;
+ default: ppIROp(op8);
+ vpanic("setFlags_DEP1_DEP2(x86)");
+ }
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(ccOp)) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+}
+
+
+/* Set the OP and DEP1 fields only, and write zero to DEP2. */
+
+static
+void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
+{
+ Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
+
+ vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
+
+ switch (op8) {
+ case Iop_Or8:
+ case Iop_And8:
+ case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
+ default: ppIROp(op8);
+ vpanic("setFlags_DEP1(x86)");
+ }
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(ccOp)) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+}
+
+
+/* For shift operations, we put in the result and the undershifted
+ result. If the shift amount is zero, however, the thunk is left
+ unchanged. */
+
+static void setFlags_DEP1_DEP2_shift ( IROp op32,
+ IRTemp res,
+ IRTemp resUS,
+ IRType ty,
+ IRTemp guard )
+{
+ Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
+
+ vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
+ vassert(guard);
+
+ /* Both kinds of right shifts are handled by the same thunk
+ operation. */
+ switch (op32) {
+ case Iop_Shr32:
+ case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
+ case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
+ default: ppIROp(op32);
+ vpanic("setFlags_DEP1_DEP2_shift(x86)");
+ }
+
+ /* DEP1 contains the result, DEP2 contains the undershifted value. */
+ stmt( IRStmt_Put( OFFB_CC_OP,
+ IRExpr_Mux0X( mkexpr(guard),
+ IRExpr_Get(OFFB_CC_OP,Ity_I32),
+ mkU32(ccOp))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ IRExpr_Mux0X( mkexpr(guard),
+ IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
+ widenUto32(mkexpr(res)))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2,
+ IRExpr_Mux0X( mkexpr(guard),
+ IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
+ widenUto32(mkexpr(resUS)))) );
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+}
+
+
+/* For the inc/dec case, we store in DEP1 the result value and in NDEP
+ the former value of the carry flag, which unfortunately we have to
+ compute. */
+
+static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
+{
+ Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
+
+ ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
+ vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
+
+ /* This has to come first, because calculating the C flag
+ may require reading all four thunk fields. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(ccOp)) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
+}
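+
+/* Example of why the carry is parked in NDEP (illustrative): x86
+ INC/DEC update O/S/Z/A/P but leave C unchanged, so after
+ "stc ; incl %eax" the thunk holds OP = X86G_CC_OP_INCL,
+ DEP1 = the result and NDEP = 1, from which the eflags helpers can
+ reinstate C = 1 while recomputing the other flags. */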
+
+
+/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
+ two arguments. */
+
+static
+void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
+{
+ switch (ty) {
+ case Ity_I8:
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
+ break;
+ case Ity_I16:
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
+ break;
+ case Ity_I32:
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
+ break;
+ default:
+ vpanic("setFlags_MUL(x86)");
+ }
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+}
+
+
+/* -------------- Condition codes. -------------- */
+
+/* Condition codes, using the Intel encoding. */
+
+static HChar* name_X86Condcode ( X86Condcode cond )
+{
+ switch (cond) {
+ case X86CondO: return "o";
+ case X86CondNO: return "no";
+ case X86CondB: return "b";
+ case X86CondNB: return "nb";
+ case X86CondZ: return "z";
+ case X86CondNZ: return "nz";
+ case X86CondBE: return "be";
+ case X86CondNBE: return "nbe";
+ case X86CondS: return "s";
+ case X86CondNS: return "ns";
+ case X86CondP: return "p";
+ case X86CondNP: return "np";
+ case X86CondL: return "l";
+ case X86CondNL: return "nl";
+ case X86CondLE: return "le";
+ case X86CondNLE: return "nle";
+ case X86CondAlways: return "ALWAYS";
+ default: vpanic("name_X86Condcode");
+ }
+}
+
+static
+X86Condcode positiveIse_X86Condcode ( X86Condcode cond,
+ Bool* needInvert )
+{
+ vassert(cond >= X86CondO && cond <= X86CondNLE);
+ if (cond & 1) {
+ *needInvert = True;
+ return cond-1;
+ } else {
+ *needInvert = False;
+ return cond;
+ }
+}
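+
+/* This works because the Intel encoding pairs each condition with
+ its negation at (cond ^ 1): for example (illustratively),
+ positiveIse_X86Condcode(X86CondNZ, &inv) returns X86CondZ with
+ inv == True, while X86CondZ itself comes back unchanged with
+ inv == False. */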
+
+
+/* -------------- Helpers for ADD/SUB with carry. -------------- */
+
+/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
+ appropriately.
+
+ Optionally, generate a store for the 'tres' value. This can either
+ be a normal store, or it can be a cas-with-possible-failure style
+ store:
+
+ if taddr is IRTemp_INVALID, then no store is generated.
+
+ if taddr is not IRTemp_INVALID, then a store (using taddr as
+ the address) is generated:
+
+ if texpVal is IRTemp_INVALID then a normal store is
+ generated, and restart_point must be zero (it is irrelevant).
+
+ if texpVal is not IRTemp_INVALID then a cas-style store is
+ generated. texpVal is the expected value, restart_point
+ is the restart point if the store fails, and texpVal must
+ have the same type as tres.
+*/
+static void helper_ADC ( Int sz,
+ IRTemp tres, IRTemp ta1, IRTemp ta2,
+ /* info about optional store: */
+ IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
+{
+ UInt thunkOp;
+ IRType ty = szToITy(sz);
+ IRTemp oldc = newTemp(Ity_I32);
+ IRTemp oldcn = newTemp(ty);
+ IROp plus = mkSizedOp(ty, Iop_Add8);
+ IROp xor = mkSizedOp(ty, Iop_Xor8);
+
+ vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
+ vassert(sz == 1 || sz == 2 || sz == 4);
+ thunkOp = sz==4 ? X86G_CC_OP_ADCL
+ : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
+
+ /* oldc = old carry flag, 0 or 1 */
+ assign( oldc, binop(Iop_And32,
+ mk_x86g_calculate_eflags_c(),
+ mkU32(1)) );
+
+ assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
+
+ assign( tres, binop(plus,
+ binop(plus,mkexpr(ta1),mkexpr(ta2)),
+ mkexpr(oldcn)) );
+
+ /* Possibly generate a store of 'tres' to 'taddr'. See comment at
+ start of this function. */
+ if (taddr != IRTemp_INVALID) {
+ if (texpVal == IRTemp_INVALID) {
+ vassert(restart_point == 0);
+ storeLE( mkexpr(taddr), mkexpr(tres) );
+ } else {
+ vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
+ /* .. and hence 'texpVal' has the same type as 'tres'. */
+ casLE( mkexpr(taddr),
+ mkexpr(texpVal), mkexpr(tres), restart_point );
+ }
+ }
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(thunkOp) ) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
+ mkexpr(oldcn)) )) );
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
+}
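+
+/* Note on the thunk layout chosen above (illustrative): DEP2 holds
+ ta2 ^ oldC rather than ta2 itself, and NDEP holds oldC, so the
+ eflags helpers can recover the real second argument as
+ DEP2 ^ NDEP. E.g. with ta2 == 0x10 and an incoming carry of 1,
+ DEP2 == 0x11 and NDEP == 1, giving back 0x10. */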
+
+
+/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
+ appropriately. As with helper_ADC, possibly generate a store of
+ the result -- see comments on helper_ADC for details.
+*/
+static void helper_SBB ( Int sz,
+ IRTemp tres, IRTemp ta1, IRTemp ta2,
+ /* info about optional store: */
+ IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
+{
+ UInt thunkOp;
+ IRType ty = szToITy(sz);
+ IRTemp oldc = newTemp(Ity_I32);
+ IRTemp oldcn = newTemp(ty);
+ IROp minus = mkSizedOp(ty, Iop_Sub8);
+ IROp xor = mkSizedOp(ty, Iop_Xor8);
+
+ vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
+ vassert(sz == 1 || sz == 2 || sz == 4);
+ thunkOp = sz==4 ? X86G_CC_OP_SBBL
+ : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
+
+ /* oldc = old carry flag, 0 or 1 */
+ assign( oldc, binop(Iop_And32,
+ mk_x86g_calculate_eflags_c(),
+ mkU32(1)) );
+
+ assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
+
+ assign( tres, binop(minus,
+ binop(minus,mkexpr(ta1),mkexpr(ta2)),
+ mkexpr(oldcn)) );
+
+ /* Possibly generate a store of 'tres' to 'taddr'. See comment at
+ start of this function. */
+ if (taddr != IRTemp_INVALID) {
+ if (texpVal == IRTemp_INVALID) {
+ vassert(restart_point == 0);
+ storeLE( mkexpr(taddr), mkexpr(tres) );
+ } else {
+ vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
+ /* .. and hence 'texpVal' has the same type as 'tres'. */
+ casLE( mkexpr(taddr),
+ mkexpr(texpVal), mkexpr(tres), restart_point );
+ }
+ }
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(thunkOp) ) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
+ mkexpr(oldcn)) )) );
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
+}
+
+
+/* -------------- Helpers for disassembly printing. -------------- */
+
+static HChar* nameGrp1 ( Int opc_aux )
+{
+ static HChar* grp1_names[8]
+ = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
+ if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
+ return grp1_names[opc_aux];
+}
+
+static HChar* nameGrp2 ( Int opc_aux )
+{
+ static HChar* grp2_names[8]
+ = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
+ if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
+ return grp2_names[opc_aux];
+}
+
+static HChar* nameGrp4 ( Int opc_aux )
+{
+ static HChar* grp4_names[8]
+ = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
+ if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
+ return grp4_names[opc_aux];
+}
+
+static HChar* nameGrp5 ( Int opc_aux )
+{
+ static HChar* grp5_names[8]
+ = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
+ if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
+ return grp5_names[opc_aux];
+}
+
+static HChar* nameGrp8 ( Int opc_aux )
+{
+ static HChar* grp8_names[8]
+ = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
+ if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
+ return grp8_names[opc_aux];
+}
+
+static HChar* nameIReg ( Int size, Int reg )
+{
+ static HChar* ireg32_names[8]
+ = { "%eax", "%ecx", "%edx", "%ebx",
+ "%esp", "%ebp", "%esi", "%edi" };
+ static HChar* ireg16_names[8]
+ = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
+ static HChar* ireg8_names[8]
+ = { "%al", "%cl", "%dl", "%bl",
+ "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
+ if (reg < 0 || reg > 7) goto bad;
+ switch (size) {
+ case 4: return ireg32_names[reg];
+ case 2: return ireg16_names[reg];
+ case 1: return ireg8_names[reg];
+ }
+ bad:
+ vpanic("nameIReg(X86)");
+ return NULL; /*notreached*/
+}
+
+static HChar* nameSReg ( UInt sreg )
+{
+ switch (sreg) {
+ case R_ES: return "%es";
+ case R_CS: return "%cs";
+ case R_SS: return "%ss";
+ case R_DS: return "%ds";
+ case R_FS: return "%fs";
+ case R_GS: return "%gs";
+ default: vpanic("nameSReg(x86)");
+ }
+}
+
+static HChar* nameMMXReg ( Int mmxreg )
+{
+ static HChar* mmx_names[8]
+ = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
+ if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
+ return mmx_names[mmxreg];
+}
+
+static HChar* nameXMMReg ( Int xmmreg )
+{
+ static HChar* xmm_names[8]
+ = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
+ if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
+ return xmm_names[xmmreg];
+}
+
+static HChar* nameMMXGran ( Int gran )
+{
+ switch (gran) {
+ case 0: return "b";
+ case 1: return "w";
+ case 2: return "d";
+ case 3: return "q";
+ default: vpanic("nameMMXGran(x86,guest)");
+ }
+}
+
+static HChar nameISize ( Int size )
+{
+ switch (size) {
+ case 4: return 'l';
+ case 2: return 'w';
+ case 1: return 'b';
+ default: vpanic("nameISize(x86)");
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- JMP helpers ---*/
+/*------------------------------------------------------------*/
+
+static void jmp_lit( IRJumpKind kind, Addr32 d32 )
+{
+ irsb->next = mkU32(d32);
+ irsb->jumpkind = kind;
+}
+
+static void jmp_treg( IRJumpKind kind, IRTemp t )
+{
+ irsb->next = mkexpr(t);
+ irsb->jumpkind = kind;
+}
+
+static
+void jcc_01( X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
+{
+ Bool invert;
+ X86Condcode condPos;
+ condPos = positiveIse_X86Condcode ( cond, &invert );
+ if (invert) {
+ stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
+ Ijk_Boring,
+ IRConst_U32(d32_false) ) );
+ irsb->next = mkU32(d32_true);
+ irsb->jumpkind = Ijk_Boring;
+ } else {
+ stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
+ Ijk_Boring,
+ IRConst_U32(d32_true) ) );
+ irsb->next = mkU32(d32_false);
+ irsb->jumpkind = Ijk_Boring;
+ }
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Disassembling addressing modes ---*/
+/*------------------------------------------------------------*/
+
+static
+HChar* sorbTxt ( UChar sorb )
+{
+ switch (sorb) {
+ case 0: return ""; /* no override */
+ case 0x3E: return "%ds:";
+ case 0x26: return "%es:";
+ case 0x64: return "%fs:";
+ case 0x65: return "%gs:";
+ default: vpanic("sorbTxt(x86,guest)");
+ }
+}
+
+
+/* 'virtual' is an IRExpr* holding a virtual address. Convert it to a
+ linear address by adding any required segment override as indicated
+ by sorb. */
+static
+IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
+{
+ Int sreg;
+ IRType hWordTy;
+ IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
+
+ if (sorb == 0)
+ /* the common case - no override */
+ return virtual;
+
+ switch (sorb) {
+ case 0x3E: sreg = R_DS; break;
+ case 0x26: sreg = R_ES; break;
+ case 0x64: sreg = R_FS; break;
+ case 0x65: sreg = R_GS; break;
+ default: vpanic("handleSegOverride(x86,guest)");
+ }
+
+ hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
+
+ seg_selector = newTemp(Ity_I32);
+ ldt_ptr = newTemp(hWordTy);
+ gdt_ptr = newTemp(hWordTy);
+ r64 = newTemp(Ity_I64);
+
+ assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
+ assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
+ assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
+
+ /*
+ Call this to do the translation and limit checks:
+ ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
+ UInt seg_selector, UInt virtual_addr )
+ */
+ assign(
+ r64,
+ mkIRExprCCall(
+ Ity_I64,
+ 0/*regparms*/,
+ "x86g_use_seg_selector",
+ &x86g_use_seg_selector,
+ mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
+ mkexpr(seg_selector), virtual)
+ )
+ );
+
+ /* If the high 32 bits of the result are non-zero, there was a
+ failure in address translation. In which case, make a
+ quick exit.
+ */
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
+ Ijk_MapFail,
+ IRConst_U32( guest_EIP_curr_instr )
+ )
+ );
+
+ /* otherwise, here's the translated result. */
+ return unop(Iop_64to32, mkexpr(r64));
+}
+
+
+/* Generate IR to calculate an address indicated by a ModRM and
+ following SIB bytes. The expression, and the number of bytes in
+ the address mode, are returned. Note that this fn should not be
+ called if the R/M part of the address denotes a register instead of
+ memory. Text of the addressing mode is placed in buf.
+
+ The computed address is stored in a new tempreg, and the
+ identity of the tempreg is returned. */
+
+static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
+{
+ IRTemp tmp = newTemp(Ity_I32);
+ assign( tmp, addr32 );
+ return tmp;
+}
+
+static
+IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
+{
+ UChar mod_reg_rm = getIByte(delta);
+ delta++;
+
+ buf[0] = (UChar)0;
+
+ /* squeeze out the reg field from mod_reg_rm, since a 256-entry
+ jump table seems a bit excessive.
+ */
+ mod_reg_rm &= 0xC7; /* is now XX000YYY */
+ mod_reg_rm = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
+ /* is now XX0XXYYY */
+ mod_reg_rm &= 0x1F; /* is now 000XXYYY */
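+ /* Worked example (illustrative): mod_reg_rm == 0x95, i.e.
+ mod=10 reg=010 rm=101, goes 0x95 & 0xC7 == 0x85, then
+ 0x85 | (0x85 >> 3) == 0x95, then 0x95 & 0x1F == 0x15 --
+ which lands in the d32(%ebp) group of cases below. */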
+ switch (mod_reg_rm) {
+
+ /* (%eax) .. (%edi), not including (%esp) or (%ebp).
+ --> GET %reg, t
+ */
+ case 0x00: case 0x01: case 0x02: case 0x03:
+ /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
+ { UChar rm = mod_reg_rm;
+ DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
+ *len = 1;
+ return disAMode_copy2tmp(
+ handleSegOverride(sorb, getIReg(4,rm)));
+ }
+
+ /* d8(%eax) ... d8(%edi), not including d8(%esp)
+ --> GET %reg, t ; ADDL d8, t
+ */
+ case 0x08: case 0x09: case 0x0A: case 0x0B:
+ /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
+ { UChar rm = toUChar(mod_reg_rm & 7);
+ UInt d = getSDisp8(delta);
+ DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
+ *len = 2;
+ return disAMode_copy2tmp(
+ handleSegOverride(sorb,
+ binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
+ }
+
+ /* d32(%eax) ... d32(%edi), not including d32(%esp)
+ --> GET %reg, t ; ADDL d32, t
+ */
+ case 0x10: case 0x11: case 0x12: case 0x13:
+ /* ! 14 */ case 0x15: case 0x16: case 0x17:
+ { UChar rm = toUChar(mod_reg_rm & 7);
+ UInt d = getUDisp32(delta);
+ DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
+ *len = 5;
+ return disAMode_copy2tmp(
+ handleSegOverride(sorb,
+ binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
+ }
+
+ /* a register, %eax .. %edi. This shouldn't happen. */
+ case 0x18: case 0x19: case 0x1A: case 0x1B:
+ case 0x1C: case 0x1D: case 0x1E: case 0x1F:
+ vpanic("disAMode(x86): not an addr!");
+
+ /* a 32-bit literal address
+ --> MOV d32, tmp
+ */
+ case 0x05:
+ { UInt d = getUDisp32(delta);
+ *len = 5;
+ DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
+ return disAMode_copy2tmp(
+ handleSegOverride(sorb, mkU32(d)));
+ }
+
+ case 0x04: {
+ /* SIB, with no displacement. Special cases:
+ -- %esp cannot act as an index value.
+ If index_r indicates %esp, zero is used for the index.
+ -- when mod is zero and base indicates EBP, base is instead
+ a 32-bit literal.
+ It's all madness, I tell you. Extract %index, %base and
+ scale from the SIB byte. The value denoted is then:
+ | %index == %ESP && %base == %EBP
+ = d32 following SIB byte
+ | %index == %ESP && %base != %EBP
+ = %base
+ | %index != %ESP && %base == %EBP
+ = d32 following SIB byte + (%index << scale)
+ | %index != %ESP && %base != %EBP
+ = %base + (%index << scale)
+
+ What happens to the souls of CPU architects who dream up such
+ horrendous schemes, do you suppose?
+ */
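+ /* Worked example (illustrative): SIB byte 0x98 splits as
+ scale=10, index=011, base=000, i.e. (%eax,%ebx,4), so the
+ denoted address is %eax + (%ebx << 2). */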
+ UChar sib = getIByte(delta);
+ UChar scale = toUChar((sib >> 6) & 3);
+ UChar index_r = toUChar((sib >> 3) & 7);
+ UChar base_r = toUChar(sib & 7);
+ delta++;
+
+ if (index_r != R_ESP && base_r != R_EBP) {
+ DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
+ nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
+ *len = 2;
+ return
+ disAMode_copy2tmp(
+ handleSegOverride(sorb,
+ binop(Iop_Add32,
+ getIReg(4,base_r),
+ binop(Iop_Shl32, getIReg(4,index_r),
+ mkU8(scale)))));
+ }
+
+ if (index_r != R_ESP && base_r == R_EBP) {
+ UInt d = getUDisp32(delta);
+ DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
+ nameIReg(4,index_r), 1<<scale);
+ *len = 6;
+ return
+ disAMode_copy2tmp(
+ handleSegOverride(sorb,
+ binop(Iop_Add32,
+ binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
+ mkU32(d))));
+ }
+
+ if (index_r == R_ESP && base_r != R_EBP) {
+ DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
+ *len = 2;
+ return disAMode_copy2tmp(
+ handleSegOverride(sorb, getIReg(4,base_r)));
+ }
+
+ if (index_r == R_ESP && base_r == R_EBP) {
+ UInt d = getUDisp32(delta);
+ DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
+ *len = 6;
+ return disAMode_copy2tmp(
+ handleSegOverride(sorb, mkU32(d)));
+ }
+ /*NOTREACHED*/
+ vassert(0);
+ }
+
+ /* SIB, with 8-bit displacement. Special cases:
+ -- %esp cannot act as an index value.
+ If index_r indicates %esp, zero is used for the index.
+ Denoted value is:
+ | %index == %ESP
+ = d8 + %base
+ | %index != %ESP
+ = d8 + %base + (%index << scale)
+ */
+ case 0x0C: {
+ UChar sib = getIByte(delta);
+ UChar scale = toUChar((sib >> 6) & 3);
+ UChar index_r = toUChar((sib >> 3) & 7);
+ UChar base_r = toUChar(sib & 7);
+ UInt d = getSDisp8(delta+1);
+
+ if (index_r == R_ESP) {
+ DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
+ (Int)d, nameIReg(4,base_r));
+ *len = 3;
+ return disAMode_copy2tmp(
+ handleSegOverride(sorb,
+ binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
+ } else {
+ DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
+ nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
+ *len = 3;
+ return
+ disAMode_copy2tmp(
+ handleSegOverride(sorb,
+ binop(Iop_Add32,
+ binop(Iop_Add32,
+ getIReg(4,base_r),
+ binop(Iop_Shl32,
+ getIReg(4,index_r), mkU8(scale))),
+ mkU32(d))));
+ }
+ /*NOTREACHED*/
+ vassert(0);
+ }
+
+ /* SIB, with 32-bit displacement. Special cases:
+ -- %esp cannot act as an index value.
+ If index_r indicates %esp, zero is used for the index.
+ Denoted value is:
+ | %index == %ESP
+ = d32 + %base
+ | %index != %ESP
+ = d32 + %base + (%index << scale)
+ */
+ case 0x14: {
+ UChar sib = getIByte(delta);
+ UChar scale = toUChar((sib >> 6) & 3);
+ UChar index_r = toUChar((sib >> 3) & 7);
+ UChar base_r = toUChar(sib & 7);
+ UInt d = getUDisp32(delta+1);
+
+ if (index_r == R_ESP) {
+ DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
+ (Int)d, nameIReg(4,base_r));
+ *len = 6;
+ return disAMode_copy2tmp(
+ handleSegOverride(sorb,
+ binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
+ } else {
+ DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
+ nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
+ *len = 6;
+ return
+ disAMode_copy2tmp(
+ handleSegOverride(sorb,
+ binop(Iop_Add32,
+ binop(Iop_Add32,
+ getIReg(4,base_r),
+ binop(Iop_Shl32,
+ getIReg(4,index_r), mkU8(scale))),
+ mkU32(d))));
+ }
+ /*NOTREACHED*/
+ vassert(0);
+ }
+
+ default:
+ vpanic("disAMode(x86)");
+ return 0; /*notreached*/
+ }
+}
+
+
+/* Figure out the number of (insn-stream) bytes constituting the amode
+ beginning at delta. This is useful for getting hold of literals beyond
+ the end of the amode before it has been disassembled. */
+
+static UInt lengthAMode ( Int delta )
+{
+ UChar mod_reg_rm = getIByte(delta); delta++;
+
+ /* squeeze out the reg field from mod_reg_rm, since a 256-entry
+ jump table seems a bit excessive.
+ */
+ mod_reg_rm &= 0xC7; /* is now XX000YYY */
+ mod_reg_rm = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
+ /* is now XX0XXYYY */
+ mod_reg_rm &= 0x1F; /* is now 000XXYYY */
+ switch (mod_reg_rm) {
+
+ /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
+ case 0x00: case 0x01: case 0x02: case 0x03:
+ /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
+ return 1;
+
+ /* d8(%eax) ... d8(%edi), not including d8(%esp). */
+ case 0x08: case 0x09: case 0x0A: case 0x0B:
+ /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
+ return 2;
+
+ /* d32(%eax) ... d32(%edi), not including d32(%esp). */
+ case 0x10: case 0x11: case 0x12: case 0x13:
+ /* ! 14 */ case 0x15: case 0x16: case 0x17:
+ return 5;
+
+ /* a register, %eax .. %edi. (Not an addr, but still handled.) */
+ case 0x18: case 0x19: case 0x1A: case 0x1B:
+ case 0x1C: case 0x1D: case 0x1E: case 0x1F:
+ return 1;
+
+ /* a 32-bit literal address. */
+ case 0x05: return 5;
+
+ /* SIB, no displacement. */
+ case 0x04: {
+ UChar sib = getIByte(delta);
+ UChar base_r = toUChar(sib & 7);
+ if (base_r == R_EBP) return 6; else return 2;
+ }
+ /* SIB, with 8-bit displacement. */
+ case 0x0C: return 3;
+
+ /* SIB, with 32-bit displacement. */
+ case 0x14: return 6;
+
+ default:
+ vpanic("lengthAMode");
+ return 0; /*notreached*/
+ }
+}
+
+/*------------------------------------------------------------*/
+/*--- Disassembling common idioms ---*/
+/*------------------------------------------------------------*/
+
+/* Handle binary integer instructions of the form
+ op E, G meaning
+ op reg-or-mem, reg
+ Is passed a pointer to the modRM byte, the actual operation, and the
+ data size. Returns the address advanced completely over this
+ instruction.
+
+ E(src) is reg-or-mem
+ G(dst) is reg.
+
+ If E is reg, --> GET %G, tmp
+ OP %E, tmp
+ PUT tmp, %G
+
+ If E is mem and OP is not reversible,
+ --> (getAddr E) -> tmpa
+ LD (tmpa), tmpa
+ GET %G, tmp2
+ OP tmpa, tmp2
+ PUT tmp2, %G
+
+ If E is mem and OP is reversible
+ --> (getAddr E) -> tmpa
+ LD (tmpa), tmpa
+ OP %G, tmpa
+ PUT tmpa, %G
+*/
+static
+UInt dis_op2_E_G ( UChar sorb,
+ Bool addSubCarry,
+ IROp op8,
+ Bool keep,
+ Int size,
+ Int delta0,
+ HChar* t_x86opc )
+{
+ HChar dis_buf[50];
+ Int len;
+ IRType ty = szToITy(size);
+ IRTemp dst1 = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dst0 = newTemp(ty);
+ UChar rm = getUChar(delta0);
+ IRTemp addr = IRTemp_INVALID;
+
+ /* addSubCarry == True indicates the intended operation is
+ add-with-carry or subtract-with-borrow. */
+ if (addSubCarry) {
+ vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
+ vassert(keep);
+ }
+
+ if (epartIsReg(rm)) {
+ /* Specially handle XOR reg,reg, because that doesn't really
+ depend on reg, and doing the obvious thing potentially
+ generates a spurious value check failure due to the bogus
+ dependency. Ditto SBB reg,reg. */
+ if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
+ && gregOfRM(rm) == eregOfRM(rm)) {
+ putIReg(size, gregOfRM(rm), mkU(ty,0));
+ }
+ assign( dst0, getIReg(size,gregOfRM(rm)) );
+ assign( src, getIReg(size,eregOfRM(rm)) );
+
+ if (addSubCarry && op8 == Iop_Add8) {
+ helper_ADC( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIReg(size, gregOfRM(rm), mkexpr(dst1));
+ } else
+ if (addSubCarry && op8 == Iop_Sub8) {
+ helper_SBB( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIReg(size, gregOfRM(rm), mkexpr(dst1));
+ } else {
+ assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ if (keep)
+ putIReg(size, gregOfRM(rm), mkexpr(dst1));
+ }
+
+ DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
+ nameIReg(size,eregOfRM(rm)),
+ nameIReg(size,gregOfRM(rm)));
+ return 1+delta0;
+ } else {
+ /* E refers to memory */
+ addr = disAMode ( &len, sorb, delta0, dis_buf);
+ assign( dst0, getIReg(size,gregOfRM(rm)) );
+ assign( src, loadLE(szToITy(size), mkexpr(addr)) );
+
+ if (addSubCarry && op8 == Iop_Add8) {
+ helper_ADC( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIReg(size, gregOfRM(rm), mkexpr(dst1));
+ } else
+ if (addSubCarry && op8 == Iop_Sub8) {
+ helper_SBB( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIReg(size, gregOfRM(rm), mkexpr(dst1));
+ } else {
+ assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ if (keep)
+ putIReg(size, gregOfRM(rm), mkexpr(dst1));
+ }
+
+ DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
+ dis_buf,nameIReg(size,gregOfRM(rm)));
+ return len+delta0;
+ }
+}
+
+
+
+/* Handle binary integer instructions of the form
+ op G, E meaning
+ op reg, reg-or-mem
+ Is passed a pointer to the modRM byte, the actual operation, and the
+ data size. Returns the address advanced completely over this
+ instruction.
+
+ G(src) is reg.
+ E(dst) is reg-or-mem
+
+ If E is reg, --> GET %E, tmp
+ OP %G, tmp
+ PUT tmp, %E
+
+ If E is mem, --> (getAddr E) -> tmpa
+ LD (tmpa), tmpv
+ OP %G, tmpv
+ ST tmpv, (tmpa)
+*/
+static
+UInt dis_op2_G_E ( UChar sorb,
+ Bool locked,
+ Bool addSubCarry,
+ IROp op8,
+ Bool keep,
+ Int size,
+ Int delta0,
+ HChar* t_x86opc )
+{
+ HChar dis_buf[50];
+ Int len;
+ IRType ty = szToITy(size);
+ IRTemp dst1 = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dst0 = newTemp(ty);
+ UChar rm = getIByte(delta0);
+ IRTemp addr = IRTemp_INVALID;
+
+ /* addSubCarry == True indicates the intended operation is
+ add-with-carry or subtract-with-borrow. */
+ if (addSubCarry) {
+ vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
+ vassert(keep);
+ }
+
+ if (epartIsReg(rm)) {
+ /* Specially handle XOR reg,reg, because that doesn't really
+ depend on reg, and doing the obvious thing potentially
+ generates a spurious value check failure due to the bogus
+ dependency. Ditto SBB reg,reg.*/
+ if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
+ && gregOfRM(rm) == eregOfRM(rm)) {
+ putIReg(size, eregOfRM(rm), mkU(ty,0));
+ }
+ assign(dst0, getIReg(size,eregOfRM(rm)));
+ assign(src, getIReg(size,gregOfRM(rm)));
+
+ if (addSubCarry && op8 == Iop_Add8) {
+ helper_ADC( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIReg(size, eregOfRM(rm), mkexpr(dst1));
+ } else
+ if (addSubCarry && op8 == Iop_Sub8) {
+ helper_SBB( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ putIReg(size, eregOfRM(rm), mkexpr(dst1));
+ } else {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ if (keep)
+ putIReg(size, eregOfRM(rm), mkexpr(dst1));
+ }
+
+ DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
+ nameIReg(size,gregOfRM(rm)),
+ nameIReg(size,eregOfRM(rm)));
+ return 1+delta0;
+ }
+
+ /* E refers to memory */
+ {
+ addr = disAMode ( &len, sorb, delta0, dis_buf);
+ assign(dst0, loadLE(ty,mkexpr(addr)));
+ assign(src, getIReg(size,gregOfRM(rm)));
+
+ if (addSubCarry && op8 == Iop_Add8) {
+ if (locked) {
+ /* cas-style store */
+ helper_ADC( size, dst1, dst0, src,
+ /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
+ } else {
+ /* normal store */
+ helper_ADC( size, dst1, dst0, src,
+ /*store*/addr, IRTemp_INVALID, 0 );
+ }
+ } else
+ if (addSubCarry && op8 == Iop_Sub8) {
+ if (locked) {
+ /* cas-style store */
+ helper_SBB( size, dst1, dst0, src,
+ /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
+ } else {
+ /* normal store */
+ helper_SBB( size, dst1, dst0, src,
+ /*store*/addr, IRTemp_INVALID, 0 );
+ }
+ } else {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
+ if (keep) {
+ if (locked) {
+ if (0) vex_printf("locked case\n" );
+ casLE( mkexpr(addr),
+ mkexpr(dst0)/*expval*/,
+ mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
+ } else {
+ if (0) vex_printf("nonlocked case\n");
+ storeLE(mkexpr(addr), mkexpr(dst1));
+ }
+ }
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ }
+
+ DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
+ nameIReg(size,gregOfRM(rm)), dis_buf);
+ return len+delta0;
+ }
+}
+
+
+/* Handle move instructions of the form
+ mov E, G meaning
+ mov reg-or-mem, reg
+ Is passed a pointer to the modRM byte, and the data size. Returns
+ the address advanced completely over this instruction.
+
+ E(src) is reg-or-mem
+ G(dst) is reg.
+
+ If E is reg, --> GET %E, tmpv
+ PUT tmpv, %G
+
+ If E is mem --> (getAddr E) -> tmpa
+ LD (tmpa), tmpb
+ PUT tmpb, %G
+*/
+static
+UInt dis_mov_E_G ( UChar sorb,
+ Int size,
+ Int delta0 )
+{
+ Int len;
+ UChar rm = getIByte(delta0);
+ HChar dis_buf[50];
+
+ if (epartIsReg(rm)) {
+ putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
+ DIP("mov%c %s,%s\n", nameISize(size),
+ nameIReg(size,eregOfRM(rm)),
+ nameIReg(size,gregOfRM(rm)));
+ return 1+delta0;
+ }
+
+ /* E refers to memory */
+ {
+ IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
+ putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
+ DIP("mov%c %s,%s\n", nameISize(size),
+ dis_buf,nameIReg(size,gregOfRM(rm)));
+ return delta0+len;
+ }
+}
+
+
+/* Handle move instructions of the form
+ mov G, E meaning
+ mov reg, reg-or-mem
+ Is passed a pointer to the modRM byte, and the data size. Returns
+ the address advanced completely over this instruction.
+
+ G(src) is reg.
+ E(dst) is reg-or-mem
+
+ If E is reg, --> GET %G, tmp
+ PUT tmp, %E
+
+ If E is mem, --> (getAddr E) -> tmpa
+ GET %G, tmpv
+ ST tmpv, (tmpa)
+*/
+static
+UInt dis_mov_G_E ( UChar sorb,
+ Int size,
+ Int delta0 )
+{
+ Int len;
+ UChar rm = getIByte(delta0);
+ HChar dis_buf[50];
+
+ if (epartIsReg(rm)) {
+ putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
+ DIP("mov%c %s,%s\n", nameISize(size),
+ nameIReg(size,gregOfRM(rm)),
+ nameIReg(size,eregOfRM(rm)));
+ return 1+delta0;
+ }
+
+ /* E refers to memory */
+ {
+ IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
+ storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
+ DIP("mov%c %s,%s\n", nameISize(size),
+ nameIReg(size,gregOfRM(rm)), dis_buf);
+ return len+delta0;
+ }
+}
+
+
+/* op $immediate, AL/AX/EAX. */
+static
+UInt dis_op_imm_A ( Int size,
+ Bool carrying,
+ IROp op8,
+ Bool keep,
+ Int delta,
+ HChar* t_x86opc )
+{
+ IRType ty = szToITy(size);
+ IRTemp dst0 = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dst1 = newTemp(ty);
+ UInt lit = getUDisp(size,delta);
+ assign(dst0, getIReg(size,R_EAX));
+ assign(src, mkU(ty,lit));
+
+ if (isAddSub(op8) && !carrying) {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ }
+ else
+ if (isLogic(op8)) {
+ vassert(!carrying);
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
+ setFlags_DEP1(op8, dst1, ty);
+ }
+ else
+ if (op8 == Iop_Add8 && carrying) {
+ helper_ADC( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ }
+ else
+ if (op8 == Iop_Sub8 && carrying) {
+ helper_SBB( size, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ }
+ else
+ vpanic("dis_op_imm_A(x86,guest)");
+
+ if (keep)
+ putIReg(size, R_EAX, mkexpr(dst1));
+
+ DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
+ lit, nameIReg(size,R_EAX));
+ return delta+size;
+}
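+
+/* Decode example (illustrative): the byte pair "0x04 0x2A" is
+ "add $0x2a,%al"; disassembly reaches here with delta pointing at
+ the immediate, size == 1, op8 == Iop_Add8, carrying == False, and
+ the function returns delta+1. */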
+
+
+/* Sign- and Zero-extending moves. */
+static
+UInt dis_movx_E_G ( UChar sorb,
+ Int delta, Int szs, Int szd, Bool sign_extend )
+{
+ UChar rm = getIByte(delta);
+ if (epartIsReg(rm)) {
+ if (szd == szs) {
+ // mutant case. See #250799
+ putIReg(szd, gregOfRM(rm),
+ getIReg(szs,eregOfRM(rm)));
+ } else {
+ // normal case
+ putIReg(szd, gregOfRM(rm),
+ unop(mkWidenOp(szs,szd,sign_extend),
+ getIReg(szs,eregOfRM(rm))));
+ }
+ DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
+ nameISize(szs), nameISize(szd),
+ nameIReg(szs,eregOfRM(rm)),
+ nameIReg(szd,gregOfRM(rm)));
+ return 1+delta;
+ }
+
+ /* E refers to memory */
+ {
+ Int len;
+ HChar dis_buf[50];
+ IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
+ if (szd == szs) {
+ // mutant case. See #250799
+ putIReg(szd, gregOfRM(rm),
+ loadLE(szToITy(szs),mkexpr(addr)));
+ } else {
+ // normal case
+ putIReg(szd, gregOfRM(rm),
+ unop(mkWidenOp(szs,szd,sign_extend),
+ loadLE(szToITy(szs),mkexpr(addr))));
+ }
+ DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
+ nameISize(szs), nameISize(szd),
+ dis_buf, nameIReg(szd,gregOfRM(rm)));
+ return len+delta;
+ }
+}
+
+
+/* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
+ 16 / 8 bit quantity in the given IRTemp. */
+static
+void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
+{
+ IROp op = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
+ IRTemp src64 = newTemp(Ity_I64);
+ IRTemp dst64 = newTemp(Ity_I64);
+ switch (sz) {
+ case 4:
+ assign( src64, binop(Iop_32HLto64,
+ getIReg(4,R_EDX), getIReg(4,R_EAX)) );
+ assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
+ putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
+ putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
+ break;
+ case 2: {
+ IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
+ IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
+ assign( src64, unop(widen3264,
+ binop(Iop_16HLto32,
+ getIReg(2,R_EDX), getIReg(2,R_EAX))) );
+ assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
+ putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
+ putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
+ break;
+ }
+ case 1: {
+ IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
+ IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
+ IROp widen816 = signed_divide ? Iop_8Sto16 : Iop_8Uto16;
+ assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
+ assign( dst64,
+ binop(op, mkexpr(src64),
+ unop(widen1632, unop(widen816, mkexpr(t)))) );
+ putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
+ unop(Iop_64to32,mkexpr(dst64)))) );
+ putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
+ unop(Iop_64HIto32,mkexpr(dst64)))) );
+ break;
+ }
+ default: vpanic("codegen_div(x86)");
+ }
+}
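+
+/* Worked example for the 32-bit case (illustrative): Iop_DivModU64to32
+ divides the 64-bit value by the 32-bit divisor and returns the
+ quotient in the low 32 bits and the remainder in the high 32 bits.
+ So for "divl" with EDX:EAX == 10 and divisor 3, src64 == 10,
+ dst64 == (1 << 32) | 3, and the code above writes EAX = 3 and
+ EDX = 1, matching the x86 semantics. */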
+
+
+static
+UInt dis_Grp1 ( UChar sorb, Bool locked,
+ Int delta, UChar modrm,
+ Int am_sz, Int d_sz, Int sz, UInt d32 )
+{
+ Int len;
+ HChar dis_buf[50];
+ IRType ty = szToITy(sz);
+ IRTemp dst1 = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dst0 = newTemp(ty);
+ IRTemp addr = IRTemp_INVALID;
+ IROp op8 = Iop_INVALID;
+ UInt mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
+
+ switch (gregOfRM(modrm)) {
+ case 0: op8 = Iop_Add8; break; case 1: op8 = Iop_Or8; break;
+ case 2: break; // ADC
+ case 3: break; // SBB
+ case 4: op8 = Iop_And8; break; case 5: op8 = Iop_Sub8; break;
+ case 6: op8 = Iop_Xor8; break; case 7: op8 = Iop_Sub8; break;
+ /*NOTREACHED*/
+ default: vpanic("dis_Grp1: unhandled case");
+ }
+
+ if (epartIsReg(modrm)) {
+ vassert(am_sz == 1);
+
+ assign(dst0, getIReg(sz,eregOfRM(modrm)));
+ assign(src, mkU(ty,d32 & mask));
+
+ if (gregOfRM(modrm) == 2 /* ADC */) {
+ helper_ADC( sz, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ } else
+ if (gregOfRM(modrm) == 3 /* SBB */) {
+ helper_SBB( sz, dst1, dst0, src,
+ /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
+ } else {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ }
+
+ if (gregOfRM(modrm) < 7)
+ putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
+
+ delta += (am_sz + d_sz);
+ DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
+ nameIReg(sz,eregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &len, sorb, delta, dis_buf);
+
+ assign(dst0, loadLE(ty,mkexpr(addr)));
+ assign(src, mkU(ty,d32 & mask));
+
+ if (gregOfRM(modrm) == 2 /* ADC */) {
+ if (locked) {
+ /* cas-style store */
+ helper_ADC( sz, dst1, dst0, src,
+ /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
+ } else {
+ /* normal store */
+ helper_ADC( sz, dst1, dst0, src,
+ /*store*/addr, IRTemp_INVALID, 0 );
+ }
+ } else
+ if (gregOfRM(modrm) == 3 /* SBB */) {
+ if (locked) {
+ /* cas-style store */
+ helper_SBB( sz, dst1, dst0, src,
+ /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
+ } else {
+ /* normal store */
+ helper_SBB( sz, dst1, dst0, src,
+ /*store*/addr, IRTemp_INVALID, 0 );
+ }
+ } else {
+ assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
+ if (gregOfRM(modrm) < 7) {
+ if (locked) {
+ casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
+ mkexpr(dst1)/*newVal*/,
+ guest_EIP_curr_instr );
+ } else {
+ storeLE(mkexpr(addr), mkexpr(dst1));
+ }
+ }
+ if (isAddSub(op8))
+ setFlags_DEP1_DEP2(op8, dst0, src, ty);
+ else
+ setFlags_DEP1(op8, dst1, ty);
+ }
+
+ delta += (len+d_sz);
+ DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
+ d32, dis_buf);
+ }
+ return delta;
+}
+
+
+/* Group 2 extended opcodes. shift_expr must be an 8-bit typed
+ expression. */
+
+static
+UInt dis_Grp2 ( UChar sorb,
+ Int delta, UChar modrm,
+ Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
+ HChar* shift_expr_txt, Bool* decode_OK )
+{
+ /* delta on entry points at the modrm byte. */
+ HChar dis_buf[50];
+ Int len;
+ Bool isShift, isRotate, isRotateC;
+ IRType ty = szToITy(sz);
+ IRTemp dst0 = newTemp(ty);
+ IRTemp dst1 = newTemp(ty);
+ IRTemp addr = IRTemp_INVALID;
+
+ *decode_OK = True;
+
+ vassert(sz == 1 || sz == 2 || sz == 4);
+
+ /* Put value to shift/rotate in dst0. */
+ if (epartIsReg(modrm)) {
+ assign(dst0, getIReg(sz, eregOfRM(modrm)));
+ delta += (am_sz + d_sz);
+ } else {
+ addr = disAMode ( &len, sorb, delta, dis_buf);
+ assign(dst0, loadLE(ty,mkexpr(addr)));
+ delta += len + d_sz;
+ }
+
+ isShift = False;
+ switch (gregOfRM(modrm)) { case 4: case 5: case 7: isShift = True; }
+
+ isRotate = False;
+ switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
+
+ isRotateC = False;
+ switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
+
+ if (gregOfRM(modrm) == 6) {
+ *decode_OK = False;
+ return delta;
+ }
+
+ if (!isShift && !isRotate && !isRotateC) {
+ /*NOTREACHED*/
+ vpanic("dis_Grp2(Reg): unhandled case(x86)");
+ }
+
+ if (isRotateC) {
+ /* call a helper; these insns are so ridiculous they do not
+ deserve better */
+ Bool left = toBool(gregOfRM(modrm) == 2);
+ IRTemp r64 = newTemp(Ity_I64);
+ IRExpr** args
+ = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
+ widenUto32(shift_expr), /* rotate amount */
+ widenUto32(mk_x86g_calculate_eflags_all()),
+ mkU32(sz) );
+ assign( r64, mkIRExprCCall(
+ Ity_I64,
+ 0/*regparm*/,
+ left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
+ left ? &x86g_calculate_RCL : &x86g_calculate_RCR,
+ args
+ )
+ );
+ /* new eflags in hi half r64; new value in lo half r64 */
+ assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+ }
+
+ if (isShift) {
+
+ IRTemp pre32 = newTemp(Ity_I32);
+ IRTemp res32 = newTemp(Ity_I32);
+ IRTemp res32ss = newTemp(Ity_I32);
+ IRTemp shift_amt = newTemp(Ity_I8);
+ IROp op32;
+
+ switch (gregOfRM(modrm)) {
+ case 4: op32 = Iop_Shl32; break;
+ case 5: op32 = Iop_Shr32; break;
+ case 7: op32 = Iop_Sar32; break;
+ /*NOTREACHED*/
+ default: vpanic("dis_Grp2:shift"); break;
+ }
+
+ /* Widen the value to be shifted to 32 bits, do the shift, and
+ narrow back down. This seems surprisingly long-winded, but
+ unfortunately the Intel semantics requires that 8/16-bit
+ shifts give defined results for shift values all the way up
+ to 31, and this seems the simplest way to do it. It has the
+ advantage that the only IR level shifts generated are of 32
+ bit values, and the shift amount is guaranteed to be in the
+ range 0 .. 31, thereby observing the IR semantics requiring
+ all shift values to be in the range 0 .. 2^word_size-1. */
+
+ /* shift_amt = shift_expr & 31, regardless of operation size */
+ assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
+
+ /* suitably widen the value to be shifted to 32 bits. */
+ assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
+ : widenUto32(mkexpr(dst0)) );
+
+ /* res32 = pre32 `shift` shift_amt */
+ assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
+
+ /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
+ assign( res32ss,
+ binop(op32,
+ mkexpr(pre32),
+ binop(Iop_And8,
+ binop(Iop_Sub8,
+ mkexpr(shift_amt), mkU8(1)),
+ mkU8(31))) );
+
+ /* Build the flags thunk. */
+ setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
+
+ /* Narrow the result back down. */
+ assign( dst1, narrowTo(ty, mkexpr(res32)) );
+
+ } /* if (isShift) */
+
+ else
+ if (isRotate) {
+ Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
+ Bool left = toBool(gregOfRM(modrm) == 0);
+ IRTemp rot_amt = newTemp(Ity_I8);
+ IRTemp rot_amt32 = newTemp(Ity_I8);
+ IRTemp oldFlags = newTemp(Ity_I32);
+
+ /* rot_amt = shift_expr & mask */
+ /* By masking the rotate amount thusly, the IR-level Shl/Shr
+ expressions never shift beyond the word size and thus remain
+ well defined. */
+ assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
+
+ if (ty == Ity_I32)
+ assign(rot_amt, mkexpr(rot_amt32));
+ else
+ assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
+
+ if (left) {
+
+ /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
+ assign(dst1,
+ binop( mkSizedOp(ty,Iop_Or8),
+ binop( mkSizedOp(ty,Iop_Shl8),
+ mkexpr(dst0),
+ mkexpr(rot_amt)
+ ),
+ binop( mkSizedOp(ty,Iop_Shr8),
+ mkexpr(dst0),
+ binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
+ )
+ )
+ );
+ ccOp += X86G_CC_OP_ROLB;
+
+ } else { /* right */
+
+ /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
+ assign(dst1,
+ binop( mkSizedOp(ty,Iop_Or8),
+ binop( mkSizedOp(ty,Iop_Shr8),
+ mkexpr(dst0),
+ mkexpr(rot_amt)
+ ),
+ binop( mkSizedOp(ty,Iop_Shl8),
+ mkexpr(dst0),
+ binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
+ )
+ )
+ );
+ ccOp += X86G_CC_OP_RORB;
+
+ }
+
+ /* dst1 now holds the rotated value. Build flag thunk. We
+ need the resulting value for this, and the previous flags.
+ Except don't set it if the rotate count is zero. */
+
+ assign(oldFlags, mk_x86g_calculate_eflags_all());
+
+ /* CC_DEP1 is the rotated value. CC_NDEP is flags before. */
+ stmt( IRStmt_Put( OFFB_CC_OP,
+ IRExpr_Mux0X( mkexpr(rot_amt32),
+ IRExpr_Get(OFFB_CC_OP,Ity_I32),
+ mkU32(ccOp))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ IRExpr_Mux0X( mkexpr(rot_amt32),
+ IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
+ widenUto32(mkexpr(dst1)))) );
+ stmt( IRStmt_Put( OFFB_CC_DEP2,
+ IRExpr_Mux0X( mkexpr(rot_amt32),
+ IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
+ mkU32(0))) );
+ stmt( IRStmt_Put( OFFB_CC_NDEP,
+ IRExpr_Mux0X( mkexpr(rot_amt32),
+ IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
+ mkexpr(oldFlags))) );
+ } /* if (isRotate) */
+
+ /* Save result, and finish up. */
+ if (epartIsReg(modrm)) {
+ putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
+ if (vex_traceflags & VEX_TRACE_FE) {
+ vex_printf("%s%c ",
+ nameGrp2(gregOfRM(modrm)), nameISize(sz) );
+ if (shift_expr_txt)
+ vex_printf("%s", shift_expr_txt);
+ else
+ ppIRExpr(shift_expr);
+ vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
+ }
+ } else {
+ storeLE(mkexpr(addr), mkexpr(dst1));
+ if (vex_traceflags & VEX_TRACE_FE) {
+ vex_printf("%s%c ",
+ nameGrp2(gregOfRM(modrm)), nameISize(sz) );
+ if (shift_expr_txt)
+ vex_printf("%s", shift_expr_txt);
+ else
+ ppIRExpr(shift_expr);
+ vex_printf(", %s\n", dis_buf);
+ }
+ }
+ return delta;
+}
+
+
+/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
+static
+UInt dis_Grp8_Imm ( UChar sorb,
+ Bool locked,
+ Int delta, UChar modrm,
+ Int am_sz, Int sz, UInt src_val,
+ Bool* decode_OK )
+{
+   /* src_val holds the d8 immediate (the literal bit offset);
+      on entry, delta points at the modrm byte. */
+
+ IRType ty = szToITy(sz);
+ IRTemp t2 = newTemp(Ity_I32);
+ IRTemp t2m = newTemp(Ity_I32);
+ IRTemp t_addr = IRTemp_INVALID;
+ HChar dis_buf[50];
+ UInt mask;
+
+ /* we're optimists :-) */
+ *decode_OK = True;
+
+ /* Limit src_val -- the bit offset -- to something within a word.
+ The Intel docs say that literal offsets larger than a word are
+ masked in this way. */
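+   /* For example, "btw $17, %ax" behaves like "btw $1, %ax",
+      since 17 & 15 == 1. */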
+ switch (sz) {
+ case 2: src_val &= 15; break;
+ case 4: src_val &= 31; break;
+ default: *decode_OK = False; return delta;
+ }
+
+ /* Invent a mask suitable for the operation. */
+ switch (gregOfRM(modrm)) {
+ case 4: /* BT */ mask = 0; break;
+ case 5: /* BTS */ mask = 1 << src_val; break;
+ case 6: /* BTR */ mask = ~(1 << src_val); break;
+ case 7: /* BTC */ mask = 1 << src_val; break;
+ /* If this needs to be extended, probably simplest to make a
+ new function to handle the other cases (0 .. 3). The
+         Intel docs, however, do not indicate any use for 0 .. 3, so
+ we don't expect this to happen. */
+ default: *decode_OK = False; return delta;
+ }
+
+ /* Fetch the value to be tested and modified into t2, which is
+ 32-bits wide regardless of sz. */
+ if (epartIsReg(modrm)) {
+ vassert(am_sz == 1);
+ assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
+ delta += (am_sz + 1);
+ DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
+ src_val, nameIReg(sz,eregOfRM(modrm)));
+ } else {
+ Int len;
+ t_addr = disAMode ( &len, sorb, delta, dis_buf);
+ delta += (len+1);
+ assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
+ DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
+ src_val, dis_buf);
+ }
+
+ /* Compute the new value into t2m, if non-BT. */
+ switch (gregOfRM(modrm)) {
+ case 4: /* BT */
+ break;
+ case 5: /* BTS */
+ assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
+ break;
+ case 6: /* BTR */
+ assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
+ break;
+ case 7: /* BTC */
+ assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
+ break;
+ default:
+ /*NOTREACHED*/ /*the previous switch guards this*/
+ vassert(0);
+ }
+
+ /* Write the result back, if non-BT. If the CAS fails then we
+ side-exit from the trace at this point, and so the flag state is
+ not affected. This is of course as required. */
+ if (gregOfRM(modrm) != 4 /* BT */) {
+ if (epartIsReg(modrm)) {
+ putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
+ } else {
+ if (locked) {
+ casLE( mkexpr(t_addr),
+ narrowTo(ty, mkexpr(t2))/*expd*/,
+ narrowTo(ty, mkexpr(t2m))/*new*/,
+ guest_EIP_curr_instr );
+ } else {
+ storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
+ }
+ }
+ }
+
+ /* Copy relevant bit from t2 into the carry flag. */
+ /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
+ mkU32(1))
+ ));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+
+ return delta;
+}
+
+
+/* Signed/unsigned widening multiply. Generate IR to multiply the
+ value in EAX/AX/AL by the given IRTemp, and park the result in
+ EDX:EAX/DX:AX/AX.
+*/
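+/* E.g. (illustrative) "mulb %bl" with AL == 0x80, BL == 0x03 puts
+   0x0180 in AX, whereas "imulb %bl" with the same bits computes
+   -128 * 3 == -384, putting 0xFE80 in AX. */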
+static void codegen_mulL_A_D ( Int sz, Bool syned,
+ IRTemp tmp, HChar* tmp_txt )
+{
+ IRType ty = szToITy(sz);
+ IRTemp t1 = newTemp(ty);
+
+ assign( t1, getIReg(sz, R_EAX) );
+
+ switch (ty) {
+ case Ity_I32: {
+ IRTemp res64 = newTemp(Ity_I64);
+ IRTemp resHi = newTemp(Ity_I32);
+ IRTemp resLo = newTemp(Ity_I32);
+ IROp mulOp = syned ? Iop_MullS32 : Iop_MullU32;
+ UInt tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
+ setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
+ assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
+ assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
+ assign( resLo, unop(Iop_64to32,mkexpr(res64)));
+ putIReg(4, R_EDX, mkexpr(resHi));
+ putIReg(4, R_EAX, mkexpr(resLo));
+ break;
+ }
+ case Ity_I16: {
+ IRTemp res32 = newTemp(Ity_I32);
+ IRTemp resHi = newTemp(Ity_I16);
+ IRTemp resLo = newTemp(Ity_I16);
+ IROp mulOp = syned ? Iop_MullS16 : Iop_MullU16;
+ UInt tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
+ setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
+ assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
+ assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
+ assign( resLo, unop(Iop_32to16,mkexpr(res32)));
+ putIReg(2, R_EDX, mkexpr(resHi));
+ putIReg(2, R_EAX, mkexpr(resLo));
+ break;
+ }
+ case Ity_I8: {
+ IRTemp res16 = newTemp(Ity_I16);
+ IRTemp resHi = newTemp(Ity_I8);
+ IRTemp resLo = newTemp(Ity_I8);
+ IROp mulOp = syned ? Iop_MullS8 : Iop_MullU8;
+ UInt tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
+ setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
+ assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
+ assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
+ assign( resLo, unop(Iop_16to8,mkexpr(res16)));
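+         /* resHi/resLo are computed but unused here: the full
+            16-bit product goes to %ax in one go. */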
+ putIReg(2, R_EAX, mkexpr(res16));
+ break;
+ }
+ default:
+ vpanic("codegen_mulL_A_D(x86)");
+ }
+ DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
+}
+
+
+/* Group 3 extended opcodes. */
+static
+UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
+{
+ UInt d32;
+ UChar modrm;
+ HChar dis_buf[50];
+ Int len;
+ IRTemp addr;
+ IRType ty = szToITy(sz);
+ IRTemp t1 = newTemp(ty);
+ IRTemp dst1, src, dst0;
+
+ *decode_OK = True; /* may change this later */
+
+ modrm = getIByte(delta);
+
+ if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
+ /* LOCK prefix only allowed with not and neg subopcodes */
+ *decode_OK = False;
+ return delta;
+ }
+
+ if (epartIsReg(modrm)) {
+ switch (gregOfRM(modrm)) {
+ case 0: { /* TEST */
+ delta++; d32 = getUDisp(sz, delta); delta += sz;
+ dst1 = newTemp(ty);
+ assign(dst1, binop(mkSizedOp(ty,Iop_And8),
+ getIReg(sz,eregOfRM(modrm)),
+ mkU(ty,d32)));
+ setFlags_DEP1( Iop_And8, dst1, ty );
+ DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
+ nameIReg(sz, eregOfRM(modrm)));
+ break;
+ }
+ case 1: /* UNDEFINED */
+ /* The Intel docs imply this insn is undefined and binutils
+ agrees. Unfortunately Core 2 will run it (with who
+            knows what result?).  sandpile.org reckons it's an alias
+ for case 0. We play safe. */
+ *decode_OK = False;
+ break;
+ case 2: /* NOT */
+ delta++;
+ putIReg(sz, eregOfRM(modrm),
+ unop(mkSizedOp(ty,Iop_Not8),
+ getIReg(sz, eregOfRM(modrm))));
+ DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
+ break;
+ case 3: /* NEG */
+ delta++;
+ dst0 = newTemp(ty);
+ src = newTemp(ty);
+ dst1 = newTemp(ty);
+ assign(dst0, mkU(ty,0));
+ assign(src, getIReg(sz,eregOfRM(modrm)));
+ assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
+ setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
+ putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
+ DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
+ break;
+ case 4: /* MUL (unsigned widening) */
+ delta++;
+ src = newTemp(ty);
+ assign(src, getIReg(sz,eregOfRM(modrm)));
+ codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
+ break;
+ case 5: /* IMUL (signed widening) */
+ delta++;
+ src = newTemp(ty);
+ assign(src, getIReg(sz,eregOfRM(modrm)));
+ codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
+ break;
+ case 6: /* DIV */
+ delta++;
+ assign( t1, getIReg(sz, eregOfRM(modrm)) );
+ codegen_div ( sz, t1, False );
+ DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
+ break;
+ case 7: /* IDIV */
+ delta++;
+ assign( t1, getIReg(sz, eregOfRM(modrm)) );
+ codegen_div ( sz, t1, True );
+ DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
+ break;
+ default:
+ /* This can't happen - gregOfRM should return 0 .. 7 only */
+ vpanic("Grp3(x86)");
+ }
+ } else {
+ addr = disAMode ( &len, sorb, delta, dis_buf );
+ t1 = newTemp(ty);
+ delta += len;
+ assign(t1, loadLE(ty,mkexpr(addr)));
+ switch (gregOfRM(modrm)) {
+ case 0: { /* TEST */
+ d32 = getUDisp(sz, delta); delta += sz;
+ dst1 = newTemp(ty);
+ assign(dst1, binop(mkSizedOp(ty,Iop_And8),
+ mkexpr(t1), mkU(ty,d32)));
+ setFlags_DEP1( Iop_And8, dst1, ty );
+ DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
+ break;
+ }
+ case 1: /* UNDEFINED */
+ /* See comment above on R case */
+ *decode_OK = False;
+ break;
+ case 2: /* NOT */
+ dst1 = newTemp(ty);
+ assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
+ if (locked) {
+ casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
+ guest_EIP_curr_instr );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(dst1) );
+ }
+ DIP("not%c %s\n", nameISize(sz), dis_buf);
+ break;
+ case 3: /* NEG */
+ dst0 = newTemp(ty);
+ src = newTemp(ty);
+ dst1 = newTemp(ty);
+ assign(dst0, mkU(ty,0));
+ assign(src, mkexpr(t1));
+ assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
+ mkexpr(dst0), mkexpr(src)));
+ if (locked) {
+ casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
+ guest_EIP_curr_instr );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(dst1) );
+ }
+ setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
+ DIP("neg%c %s\n", nameISize(sz), dis_buf);
+ break;
+ case 4: /* MUL */
+ codegen_mulL_A_D ( sz, False, t1, dis_buf );
+ break;
+ case 5: /* IMUL */
+ codegen_mulL_A_D ( sz, True, t1, dis_buf );
+ break;
+ case 6: /* DIV */
+ codegen_div ( sz, t1, False );
+ DIP("div%c %s\n", nameISize(sz), dis_buf);
+ break;
+ case 7: /* IDIV */
+ codegen_div ( sz, t1, True );
+ DIP("idiv%c %s\n", nameISize(sz), dis_buf);
+ break;
+ default:
+ /* This can't happen - gregOfRM should return 0 .. 7 only */
+ vpanic("Grp3(x86)");
+ }
+ }
+ return delta;
+}
+
+
+/* Group 4 extended opcodes. */
+static
+UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
+{
+ Int alen;
+ UChar modrm;
+ HChar dis_buf[50];
+ IRType ty = Ity_I8;
+ IRTemp t1 = newTemp(ty);
+ IRTemp t2 = newTemp(ty);
+
+ *decode_OK = True;
+
+ modrm = getIByte(delta);
+
+ if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
+ /* LOCK prefix only allowed with inc and dec subopcodes */
+ *decode_OK = False;
+ return delta;
+ }
+
+ if (epartIsReg(modrm)) {
+ assign(t1, getIReg(1, eregOfRM(modrm)));
+ switch (gregOfRM(modrm)) {
+ case 0: /* INC */
+ assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
+ putIReg(1, eregOfRM(modrm), mkexpr(t2));
+ setFlags_INC_DEC( True, t2, ty );
+ break;
+ case 1: /* DEC */
+ assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
+ putIReg(1, eregOfRM(modrm), mkexpr(t2));
+ setFlags_INC_DEC( False, t2, ty );
+ break;
+ default:
+ *decode_OK = False;
+ return delta;
+ }
+ delta++;
+ DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
+ nameIReg(1, eregOfRM(modrm)));
+ } else {
+ IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
+ assign( t1, loadLE(ty, mkexpr(addr)) );
+ switch (gregOfRM(modrm)) {
+ case 0: /* INC */
+ assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
+ if (locked) {
+ casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
+ guest_EIP_curr_instr );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(t2) );
+ }
+ setFlags_INC_DEC( True, t2, ty );
+ break;
+ case 1: /* DEC */
+ assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
+ if (locked) {
+ casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
+ guest_EIP_curr_instr );
+ } else {
+ storeLE( mkexpr(addr), mkexpr(t2) );
+ }
+ setFlags_INC_DEC( False, t2, ty );
+ break;
+ default:
+ *decode_OK = False;
+ return delta;
+ }
+ delta += alen;
+ DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
+ }
+ return delta;
+}
+
+
+/* Group 5 extended opcodes. */
+static
+UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
+ DisResult* dres, Bool* decode_OK )
+{
+ Int len;
+ UChar modrm;
+ HChar dis_buf[50];
+ IRTemp addr = IRTemp_INVALID;
+ IRType ty = szToITy(sz);
+ IRTemp t1 = newTemp(ty);
+ IRTemp t2 = IRTemp_INVALID;
+
+ *decode_OK = True;
+
+ modrm = getIByte(delta);
+
+ if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
+ /* LOCK prefix only allowed with inc and dec subopcodes */
+ *decode_OK = False;
+ return delta;
+ }
+
+ if (epartIsReg(modrm)) {
+ assign(t1, getIReg(sz,eregOfRM(modrm)));
+ switch (gregOfRM(modrm)) {
+ case 0: /* INC */
+ vassert(sz == 2 || sz == 4);
+ t2 = newTemp(ty);
+ assign(t2, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(t1), mkU(ty,1)));
+ setFlags_INC_DEC( True, t2, ty );
+ putIReg(sz,eregOfRM(modrm),mkexpr(t2));
+ break;
+ case 1: /* DEC */
+ vassert(sz == 2 || sz == 4);
+ t2 = newTemp(ty);
+ assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
+ mkexpr(t1), mkU(ty,1)));
+ setFlags_INC_DEC( False, t2, ty );
+ putIReg(sz,eregOfRM(modrm),mkexpr(t2));
+ break;
+ case 2: /* call Ev */
+ vassert(sz == 4);
+ t2 = newTemp(Ity_I32);
+ assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
+ putIReg(4, R_ESP, mkexpr(t2));
+ storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
+ jmp_treg(Ijk_Call,t1);
+ dres->whatNext = Dis_StopHere;
+ break;
+ case 4: /* jmp Ev */
+ vassert(sz == 4);
+ jmp_treg(Ijk_Boring,t1);
+ dres->whatNext = Dis_StopHere;
+ break;
+ case 6: /* PUSH Ev */
+ vassert(sz == 4 || sz == 2);
+ t2 = newTemp(Ity_I32);
+ assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
+ putIReg(4, R_ESP, mkexpr(t2) );
+ storeLE( mkexpr(t2), mkexpr(t1) );
+ break;
+ default:
+ *decode_OK = False;
+ return delta;
+ }
+ delta++;
+ DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
+ nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &len, sorb, delta, dis_buf );
+ assign(t1, loadLE(ty,mkexpr(addr)));
+ switch (gregOfRM(modrm)) {
+ case 0: /* INC */
+ t2 = newTemp(ty);
+ assign(t2, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(t1), mkU(ty,1)));
+ if (locked) {
+ casLE( mkexpr(addr),
+ mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
+ } else {
+ storeLE(mkexpr(addr),mkexpr(t2));
+ }
+ setFlags_INC_DEC( True, t2, ty );
+ break;
+ case 1: /* DEC */
+ t2 = newTemp(ty);
+ assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
+ mkexpr(t1), mkU(ty,1)));
+ if (locked) {
+ casLE( mkexpr(addr),
+ mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
+ } else {
+ storeLE(mkexpr(addr),mkexpr(t2));
+ }
+ setFlags_INC_DEC( False, t2, ty );
+ break;
+ case 2: /* call Ev */
+ vassert(sz == 4);
+ t2 = newTemp(Ity_I32);
+ assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
+ putIReg(4, R_ESP, mkexpr(t2));
+ storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
+ jmp_treg(Ijk_Call,t1);
+ dres->whatNext = Dis_StopHere;
+ break;
+ case 4: /* JMP Ev */
+ vassert(sz == 4);
+ jmp_treg(Ijk_Boring,t1);
+ dres->whatNext = Dis_StopHere;
+ break;
+ case 6: /* PUSH Ev */
+ vassert(sz == 4 || sz == 2);
+ t2 = newTemp(Ity_I32);
+ assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
+ putIReg(4, R_ESP, mkexpr(t2) );
+ storeLE( mkexpr(t2), mkexpr(t1) );
+ break;
+ default:
+ *decode_OK = False;
+ return delta;
+ }
+ delta += len;
+ DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
+ nameISize(sz), dis_buf);
+ }
+ return delta;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Disassembling string ops (including REP prefixes) ---*/
+/*------------------------------------------------------------*/
+
+/* Code shared by all the string ops */
+static
+void dis_string_op_increment(Int sz, IRTemp t_inc)
+{
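+   /* The D flag is kept as +1 or -1.  Shifting it left by sz/2
+      (by 1 for sz==2, by 2 for sz==4) scales it to +/-sz, the
+      per-iteration increment. */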
+ if (sz == 4 || sz == 2) {
+ assign( t_inc,
+ binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
+ mkU8(sz/2) ) );
+ } else {
+ assign( t_inc,
+ IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
+ }
+}
+
+static
+void dis_string_op( void (*dis_OP)( Int, IRTemp ),
+ Int sz, HChar* name, UChar sorb )
+{
+ IRTemp t_inc = newTemp(Ity_I32);
+ vassert(sorb == 0); /* hmm. so what was the point of passing it in? */
+ dis_string_op_increment(sz, t_inc);
+ dis_OP( sz, t_inc );
+ DIP("%s%c\n", name, nameISize(sz));
+}
+
+static
+void dis_MOVS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp td = newTemp(Ity_I32); /* EDI */
+ IRTemp ts = newTemp(Ity_I32); /* ESI */
+
+ assign( td, getIReg(4, R_EDI) );
+ assign( ts, getIReg(4, R_ESI) );
+
+ storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
+
+ putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
+ putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
+}
+
+static
+void dis_LODS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp ts = newTemp(Ity_I32); /* ESI */
+
+ assign( ts, getIReg(4, R_ESI) );
+
+ putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
+
+ putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
+}
+
+static
+void dis_STOS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp ta = newTemp(ty); /* EAX */
+ IRTemp td = newTemp(Ity_I32); /* EDI */
+
+ assign( ta, getIReg(sz, R_EAX) );
+ assign( td, getIReg(4, R_EDI) );
+
+ storeLE( mkexpr(td), mkexpr(ta) );
+
+ putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
+}
+
+static
+void dis_CMPS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp tdv = newTemp(ty); /* (EDI) */
+ IRTemp tsv = newTemp(ty); /* (ESI) */
+ IRTemp td = newTemp(Ity_I32); /* EDI */
+ IRTemp ts = newTemp(Ity_I32); /* ESI */
+
+ assign( td, getIReg(4, R_EDI) );
+ assign( ts, getIReg(4, R_ESI) );
+
+ assign( tdv, loadLE(ty,mkexpr(td)) );
+ assign( tsv, loadLE(ty,mkexpr(ts)) );
+
+ setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
+
+ putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
+ putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
+}
+
+static
+void dis_SCAS ( Int sz, IRTemp t_inc )
+{
+ IRType ty = szToITy(sz);
+ IRTemp ta = newTemp(ty); /* EAX */
+ IRTemp td = newTemp(Ity_I32); /* EDI */
+ IRTemp tdv = newTemp(ty); /* (EDI) */
+
+ assign( ta, getIReg(sz, R_EAX) );
+ assign( td, getIReg(4, R_EDI) );
+
+ assign( tdv, loadLE(ty,mkexpr(td)) );
+ setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
+
+ putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
+}
+
+
+/* Wrap the appropriate string op inside a REP/REPE/REPNE.
+ We assume the insn is the last one in the basic block, and so emit a jump
+ to the next insn, rather than just falling through. */
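+/* Illustrative sketch of the IR this generates, e.g. for "repe cmpsb"
+   (cond == X86CondZ):
+
+      if (ECX == 0) goto eip_next;   -- count exhausted
+      ECX := ECX - 1;
+      <one CMPSB iteration>;
+      if (ZF) goto eip;              -- re-dispatch this insn
+      goto eip_next;
+
+   For plain REP (cond == X86CondAlways) the conditional re-dispatch
+   becomes an unconditional jump back to eip. */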
+static
+void dis_REP_op ( X86Condcode cond,
+ void (*dis_OP)(Int, IRTemp),
+ Int sz, Addr32 eip, Addr32 eip_next, HChar* name )
+{
+ IRTemp t_inc = newTemp(Ity_I32);
+ IRTemp tc = newTemp(Ity_I32); /* ECX */
+
+ assign( tc, getIReg(4,R_ECX) );
+
+ stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
+ Ijk_Boring,
+ IRConst_U32(eip_next) ) );
+
+ putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
+
+ dis_string_op_increment(sz, t_inc);
+ dis_OP (sz, t_inc);
+
+ if (cond == X86CondAlways) {
+ jmp_lit(Ijk_Boring,eip);
+ } else {
+ stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
+ Ijk_Boring,
+ IRConst_U32(eip) ) );
+ jmp_lit(Ijk_Boring,eip_next);
+ }
+ DIP("%s%c\n", name, nameISize(sz));
+}
+
+
+/*------------------------------------------------------------*/
+/*--- Arithmetic, etc. ---*/
+/*------------------------------------------------------------*/
+
+/* IMUL E, G.  On entry, delta0 points at the modR/M byte. */
+static
+UInt dis_mul_E_G ( UChar sorb,
+ Int size,
+ Int delta0 )
+{
+ Int alen;
+ HChar dis_buf[50];
+ UChar rm = getIByte(delta0);
+ IRType ty = szToITy(size);
+ IRTemp te = newTemp(ty);
+ IRTemp tg = newTemp(ty);
+ IRTemp resLo = newTemp(ty);
+
+ assign( tg, getIReg(size, gregOfRM(rm)) );
+ if (epartIsReg(rm)) {
+ assign( te, getIReg(size, eregOfRM(rm)) );
+ } else {
+ IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
+ assign( te, loadLE(ty,mkexpr(addr)) );
+ }
+
+ setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
+
+ assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
+
+ putIReg(size, gregOfRM(rm), mkexpr(resLo) );
+
+ if (epartIsReg(rm)) {
+ DIP("imul%c %s, %s\n", nameISize(size),
+ nameIReg(size,eregOfRM(rm)),
+ nameIReg(size,gregOfRM(rm)));
+ return 1+delta0;
+ } else {
+ DIP("imul%c %s, %s\n", nameISize(size),
+ dis_buf, nameIReg(size,gregOfRM(rm)));
+ return alen+delta0;
+ }
+}
+
+
+/* IMUL I * E -> G.  On entry, delta points at the modR/M byte. */
+static
+UInt dis_imul_I_E_G ( UChar sorb,
+ Int size,
+ Int delta,
+ Int litsize )
+{
+ Int d32, alen;
+ HChar dis_buf[50];
+ UChar rm = getIByte(delta);
+ IRType ty = szToITy(size);
+ IRTemp te = newTemp(ty);
+ IRTemp tl = newTemp(ty);
+ IRTemp resLo = newTemp(ty);
+
+ vassert(size == 1 || size == 2 || size == 4);
+
+ if (epartIsReg(rm)) {
+ assign(te, getIReg(size, eregOfRM(rm)));
+ delta++;
+ } else {
+ IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
+ assign(te, loadLE(ty, mkexpr(addr)));
+ delta += alen;
+ }
+ d32 = getSDisp(litsize,delta);
+ delta += litsize;
+
+ if (size == 1) d32 &= 0xFF;
+ if (size == 2) d32 &= 0xFFFF;
+
+ assign(tl, mkU(ty,d32));
+
+ assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
+
+ setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
+
+ putIReg(size, gregOfRM(rm), mkexpr(resLo));
+
+ DIP("imul %d, %s, %s\n", d32,
+ ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
+ nameIReg(size,gregOfRM(rm)) );
+ return delta;
+}
+
+
+/* Generate an IR sequence to do a count-leading-zeroes operation on
+ the supplied IRTemp, and return a new IRTemp holding the result.
+ 'ty' may be Ity_I16 or Ity_I32 only. In the case where the
+ argument is zero, return the number of bits in the word (the
+ natural semantics). */
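+/* E.g. for ty == Ity_I16 and src == 0x0001: src32x == 0x0001 << 16
+   == 0x00010000, so Clz32 gives 15, the expected LZCNT result; a
+   zero source is special-cased below to yield 16, the operand
+   width. */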
+static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
+{
+ vassert(ty == Ity_I32 || ty == Ity_I16);
+
+ IRTemp src32 = newTemp(Ity_I32);
+ assign(src32, widenUto32( mkexpr(src) ));
+
+ IRTemp src32x = newTemp(Ity_I32);
+ assign(src32x,
+ binop(Iop_Shl32, mkexpr(src32),
+ mkU8(32 - 8 * sizeofIRType(ty))));
+
+ // Clz32 has undefined semantics when its input is zero, so
+ // special-case around that.
+ IRTemp res32 = newTemp(Ity_I32);
+ assign(res32,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0))),
+ unop(Iop_Clz32, mkexpr(src32x)),
+ mkU32(8 * sizeofIRType(ty))
+ ));
+
+ IRTemp res = newTemp(ty);
+ assign(res, narrowTo(ty, mkexpr(res32)));
+ return res;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- ---*/
+/*--- x87 FLOATING POINT INSTRUCTIONS ---*/
+/*--- ---*/
+/*------------------------------------------------------------*/
+
+/* --- Helper functions for dealing with the register stack. --- */
+
+/* --- Set the emulation-warning pseudo-register. --- */
+
+static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
+ stmt( IRStmt_Put( OFFB_EMWARN, e ) );
+}
+
+/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
+
+static IRExpr* mkQNaN64 ( void )
+{
+ /* QNaN is 0 2047 1 0(51times)
+ == 0b 11111111111b 1 0(51times)
+ == 0x7FF8 0000 0000 0000
+ */
+ return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
+}
+
+/* --------- Get/put the top-of-stack pointer. --------- */
+
+static IRExpr* get_ftop ( void )
+{
+ return IRExpr_Get( OFFB_FTOP, Ity_I32 );
+}
+
+static void put_ftop ( IRExpr* e )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
+ stmt( IRStmt_Put( OFFB_FTOP, e ) );
+}
+
+/* --------- Get/put the C3210 bits. --------- */
+
+static IRExpr* get_C3210 ( void )
+{
+ return IRExpr_Get( OFFB_FC3210, Ity_I32 );
+}
+
+static void put_C3210 ( IRExpr* e )
+{
+ stmt( IRStmt_Put( OFFB_FC3210, e ) );
+}
+
+/* --------- Get/put the FPU rounding mode. --------- */
+static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
+{
+ return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
+}
+
+static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
+{
+ stmt( IRStmt_Put( OFFB_FPROUND, e ) );
+}
+
+
+/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
+/* Produces a value in 0 .. 3, which is encoded as per the type
+ IRRoundingMode. Since the guest_FPROUND value is also encoded as
+ per IRRoundingMode, we merely need to get it and mask it for
+ safety.
+*/
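+/* For reference, the IRRoundingMode encoding (which matches the x87
+   RC field): 0 == Irrm_NEAREST, 1 == Irrm_NegINF (round down),
+   2 == Irrm_PosINF (round up), 3 == Irrm_ZERO (truncate). */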
+static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
+{
+ return binop( Iop_And32, get_fpround(), mkU32(3) );
+}
+
+static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
+{
+ return mkU32(Irrm_NEAREST);
+}
+
+
+/* --------- Get/set FP register tag bytes. --------- */
+
+/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
+
+static void put_ST_TAG ( Int i, IRExpr* value )
+{
+ IRRegArray* descr;
+ vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
+ descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
+ stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
+}
+
+/* Given i, generate an expression yielding 'ST_TAG(i)'. This will be
+ zero to indicate "Empty" and nonzero to indicate "NonEmpty". */
+
+static IRExpr* get_ST_TAG ( Int i )
+{
+ IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
+ return IRExpr_GetI( descr, get_ftop(), i );
+}
+
+
+/* --------- Get/set FP registers. --------- */
+
+/* Given i, and some expression e, emit 'ST(i) = e' and set the
+ register's tag to indicate the register is full. The previous
+ state of the register is not checked. */
+
+static void put_ST_UNCHECKED ( Int i, IRExpr* value )
+{
+ IRRegArray* descr;
+ vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
+ descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
+ stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
+ /* Mark the register as in-use. */
+ put_ST_TAG(i, mkU8(1));
+}
+
+/* Given i, and some expression e, emit
+ ST(i) = is_full(i) ? NaN : e
+ and set the tag accordingly.
+*/
+
+static void put_ST ( Int i, IRExpr* value )
+{
+ put_ST_UNCHECKED( i,
+ IRExpr_Mux0X( get_ST_TAG(i),
+ /* 0 means empty */
+ value,
+ /* non-0 means full */
+ mkQNaN64()
+ )
+ );
+}
+
+
+/* Given i, generate an expression yielding 'ST(i)'. */
+
+static IRExpr* get_ST_UNCHECKED ( Int i )
+{
+ IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
+ return IRExpr_GetI( descr, get_ftop(), i );
+}
+
+
+/* Given i, generate an expression yielding
+ is_full(i) ? ST(i) : NaN
+*/
+
+static IRExpr* get_ST ( Int i )
+{
+ return
+ IRExpr_Mux0X( get_ST_TAG(i),
+ /* 0 means empty */
+ mkQNaN64(),
+ /* non-0 means full */
+ get_ST_UNCHECKED(i));
+}
+
+
+/* Adjust FTOP downwards by one register. */
+
+static void fp_push ( void )
+{
+ put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
+}
+
+/* Adjust FTOP upwards by one register, and mark the vacated register
+ as empty. */
+
+static void fp_pop ( void )
+{
+ put_ST_TAG(0, mkU8(0));
+ put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
+}
+
+/* Clear the C2 bit of the FPU status register, for
+ sin/cos/tan/sincos. */
+
+static void clear_C2 ( void )
+{
+ put_C3210( binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2)) );
+}
+
+/* Invent a plausible-looking FPU status word value:
+ ((ftop & 7) << 11) | (c3210 & 0x4700)
+ */
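+/* E.g. ftop == 5 with just C3 set (c3210 == 0x4000) gives
+   (5 << 11) | 0x4000 == 0x6800, narrowed to 16 bits below. */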
+static IRExpr* get_FPU_sw ( void )
+{
+ return
+ unop(Iop_32to16,
+ binop(Iop_Or32,
+ binop(Iop_Shl32,
+ binop(Iop_And32, get_ftop(), mkU32(7)),
+ mkU8(11)),
+ binop(Iop_And32, get_C3210(), mkU32(0x4700))
+ ));
+}
+
+
+/* ------------------------------------------------------- */
+/* Given all that stack-mangling junk, we can now go ahead
+ and describe FP instructions.
+*/
+
+/* ST(0) = ST(0) `op` mem64/32(addr)
+ Need to check ST(0)'s tag on read, but not on write.
+*/
+static
+void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
+ IROp op, Bool dbl )
+{
+ DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
+ if (dbl) {
+ put_ST_UNCHECKED(0,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ loadLE(Ity_F64,mkexpr(addr))
+ ));
+ } else {
+ put_ST_UNCHECKED(0,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
+ ));
+ }
+}
+
+
+/* ST(0) = mem64/32(addr) `op` ST(0)
+ Need to check ST(0)'s tag on read, but not on write.
+*/
+static
+void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
+ IROp op, Bool dbl )
+{
+ DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
+ if (dbl) {
+ put_ST_UNCHECKED(0,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ loadLE(Ity_F64,mkexpr(addr)),
+ get_ST(0)
+ ));
+ } else {
+ put_ST_UNCHECKED(0,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
+ get_ST(0)
+ ));
+ }
+}
+
+
+/* ST(dst) = ST(dst) `op` ST(src).
+   Check dst and src tags on read, but not on write.
+*/
+static
+void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
+ Bool pop_after )
+{
+ DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
+ (Int)st_src, (Int)st_dst );
+ put_ST_UNCHECKED(
+ st_dst,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(st_dst),
+ get_ST(st_src) )
+ );
+ if (pop_after)
+ fp_pop();
+}
+
+/* ST(dst) = ST(src) `op` ST(dst).
+   Check dst and src tags on read, but not on write.
+*/
+static
+void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
+ Bool pop_after )
+{
+ DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
+ (Int)st_src, (Int)st_dst );
+ put_ST_UNCHECKED(
+ st_dst,
+ triop( op,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(st_src),
+ get_ST(st_dst) )
+ );
+ if (pop_after)
+ fp_pop();
+}
+
+/* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
+static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
+{
+ DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
+ /* This is a bit of a hack (and isn't really right). It sets
+ Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
+ documentation implies A and S are unchanged.
+ */
+ /* It's also fishy in that it is used both for COMIP and
+ UCOMIP, and they aren't the same (although similar). */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ binop( Iop_And32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(i)),
+ mkU32(0x45)
+ )));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+ if (pop_after)
+ fp_pop();
+}
+
+
+static
+UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
+{
+ Int len;
+ UInt r_src, r_dst;
+ HChar dis_buf[50];
+ IRTemp t1, t2;
+
+ /* On entry, delta points at the second byte of the insn (the modrm
+ byte).*/
+ UChar first_opcode = getIByte(delta-1);
+ UChar modrm = getIByte(delta+0);
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
+
+ if (first_opcode == 0xD8) {
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+
+ switch (gregOfRM(modrm)) {
+
+ case 0: /* FADD single-real */
+ fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
+ break;
+
+ case 1: /* FMUL single-real */
+ fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
+ break;
+
+ case 2: /* FCOM single-real */
+ DIP("fcoms %s\n", dis_buf);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ unop(Iop_F32toF64,
+ loadLE(Ity_F32,mkexpr(addr)))),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ break;
+
+ case 3: /* FCOMP single-real */
+ DIP("fcomps %s\n", dis_buf);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ unop(Iop_F32toF64,
+ loadLE(Ity_F32,mkexpr(addr)))),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ fp_pop();
+ break;
+
+ case 4: /* FSUB single-real */
+ fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
+ break;
+
+ case 5: /* FSUBR single-real */
+ fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
+ break;
+
+ case 6: /* FDIV single-real */
+ fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
+ break;
+
+ case 7: /* FDIVR single-real */
+ fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
+ vex_printf("first_opcode == 0xD8\n");
+ goto decode_fail;
+ }
+ } else {
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
+ fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
+ break;
+
+ case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
+ fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
+ break;
+
+ /* Dunno if this is right */
+ case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
+ r_dst = (UInt)modrm - 0xD0;
+ DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ break;
+
+ /* Dunno if this is right */
+ case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
+ r_dst = (UInt)modrm - 0xD8;
+ DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ fp_pop();
+ break;
+
+ case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
+ fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
+ break;
+
+ case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
+ fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
+ break;
+
+ case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
+ fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
+ break;
+
+ case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
+ fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xD9) {
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+
+ switch (gregOfRM(modrm)) {
+
+ case 0: /* FLD single-real */
+ DIP("flds %s\n", dis_buf);
+ fp_push();
+ put_ST(0, unop(Iop_F32toF64,
+ loadLE(Ity_F32, mkexpr(addr))));
+ break;
+
+ case 2: /* FST single-real */
+ DIP("fsts %s\n", dis_buf);
+ storeLE(mkexpr(addr),
+ binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
+ break;
+
+ case 3: /* FSTP single-real */
+ DIP("fstps %s\n", dis_buf);
+ storeLE(mkexpr(addr),
+ binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
+ fp_pop();
+ break;
+
+ case 4: { /* FLDENV m28 */
+ /* Uses dirty helper:
+                 VexEmWarn x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord ) */
+ IRTemp ew = newTemp(Ity_I32);
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "x86g_dirtyhelper_FLDENV",
+ &x86g_dirtyhelper_FLDENV,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+ d->tmp = ew;
+ /* declare we're reading memory */
+ d->mFx = Ifx_Read;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 28;
+
+ /* declare we're writing guest state */
+ d->nFxState = 4;
+
+ d->fxState[0].fx = Ifx_Write;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Write;
+ d->fxState[1].offset = OFFB_FPTAGS;
+ d->fxState[1].size = 8 * sizeof(UChar);
+
+ d->fxState[2].fx = Ifx_Write;
+ d->fxState[2].offset = OFFB_FPROUND;
+ d->fxState[2].size = sizeof(UInt);
+
+ d->fxState[3].fx = Ifx_Write;
+ d->fxState[3].offset = OFFB_FC3210;
+ d->fxState[3].size = sizeof(UInt);
+
+ stmt( IRStmt_Dirty(d) );
+
+ /* ew contains any emulation warning we may need to
+ issue. If needed, side-exit to the next insn,
+ reporting the warning, so that Valgrind's dispatcher
+ sees the warning. */
+ put_emwarn( mkexpr(ew) );
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
+ Ijk_EmWarn,
+ IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
+ )
+ );
+
+ DIP("fldenv %s\n", dis_buf);
+ break;
+ }
+
+         case 5: { /* FLDCW */
+ /* The only thing we observe in the control word is the
+ rounding mode. Therefore, pass the 16-bit value
+ (x87 native-format control word) to a clean helper,
+ getting back a 64-bit value, the lower half of which
+ is the FPROUND value to store, and the upper half of
+ which is the emulation-warning token which may be
+ generated.
+ */
+            /* ULong x86g_check_fldcw ( UInt ); */
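+            /* E.g. for the power-up control word 0x037F the RC
+               field (bits 11:10) is zero, so the helper should
+               return FPROUND == Irrm_NEAREST in the low half and a
+               zero emulation-warning token in the high half. */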
+ IRTemp t64 = newTemp(Ity_I64);
+ IRTemp ew = newTemp(Ity_I32);
+ DIP("fldcw %s\n", dis_buf);
+ assign( t64, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86g_check_fldcw",
+ &x86g_check_fldcw,
+ mkIRExprVec_1(
+ unop( Iop_16Uto32,
+ loadLE(Ity_I16, mkexpr(addr)))
+ )
+ )
+ );
+
+ put_fpround( unop(Iop_64to32, mkexpr(t64)) );
+ assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
+ put_emwarn( mkexpr(ew) );
+ /* Finally, if an emulation warning was reported,
+ side-exit to the next insn, reporting the warning,
+ so that Valgrind's dispatcher sees the warning. */
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
+ Ijk_EmWarn,
+ IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
+ )
+ );
+ break;
+ }
+
+ case 6: { /* FNSTENV m28 */
+ /* Uses dirty helper:
+               void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "x86g_dirtyhelper_FSTENV",
+ &x86g_dirtyhelper_FSTENV,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+ /* declare we're writing memory */
+ d->mFx = Ifx_Write;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 28;
+
+ /* declare we're reading guest state */
+ d->nFxState = 4;
+
+ d->fxState[0].fx = Ifx_Read;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Read;
+ d->fxState[1].offset = OFFB_FPTAGS;
+ d->fxState[1].size = 8 * sizeof(UChar);
+
+ d->fxState[2].fx = Ifx_Read;
+ d->fxState[2].offset = OFFB_FPROUND;
+ d->fxState[2].size = sizeof(UInt);
+
+ d->fxState[3].fx = Ifx_Read;
+ d->fxState[3].offset = OFFB_FC3210;
+ d->fxState[3].size = sizeof(UInt);
+
+ stmt( IRStmt_Dirty(d) );
+
+ DIP("fnstenv %s\n", dis_buf);
+ break;
+ }
+
+ case 7: /* FNSTCW */
+ /* Fake up a native x87 FPU control word. The only
+ thing it depends on is FPROUND[1:0], so call a clean
+ helper to cook it up. */
+            /* UInt x86g_create_fpucw ( UInt fpround ) */
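+            /* Presumably this just plants FPROUND[1:0] into the RC
+               field of an otherwise-default control word, e.g.
+               Irrm_ZERO (3) -> 0x037F | (3 << 10) == 0x0F7F. */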
+ DIP("fnstcw %s\n", dis_buf);
+ storeLE(
+ mkexpr(addr),
+ unop( Iop_32to16,
+ mkIRExprCCall(
+ Ity_I32, 0/*regp*/,
+ "x86g_create_fpucw", &x86g_create_fpucw,
+ mkIRExprVec_1( get_fpround() )
+ )
+ )
+ );
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
+ vex_printf("first_opcode == 0xD9\n");
+ goto decode_fail;
+ }
+
+ } else {
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FLD %st(?) */
+ r_src = (UInt)modrm - 0xC0;
+ DIP("fld %%st(%d)\n", (Int)r_src);
+ t1 = newTemp(Ity_F64);
+ assign(t1, get_ST(r_src));
+ fp_push();
+ put_ST(0, mkexpr(t1));
+ break;
+
+ case 0xC8 ... 0xCF: /* FXCH %st(?) */
+ r_src = (UInt)modrm - 0xC8;
+ DIP("fxch %%st(%d)\n", (Int)r_src);
+ t1 = newTemp(Ity_F64);
+ t2 = newTemp(Ity_F64);
+ assign(t1, get_ST(0));
+ assign(t2, get_ST(r_src));
+ put_ST_UNCHECKED(0, mkexpr(t2));
+ put_ST_UNCHECKED(r_src, mkexpr(t1));
+ break;
+
+ case 0xE0: /* FCHS */
+ DIP("fchs\n");
+ put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
+ break;
+
+ case 0xE1: /* FABS */
+ DIP("fabs\n");
+ put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
+ break;
+
+ case 0xE4: /* FTST */
+ DIP("ftst\n");
+ /* This forces C1 to zero, which isn't right. */
+ /* Well, in fact the Intel docs say (bizarrely): "C1 is
+ set to 0 if stack underflow occurred; otherwise, set
+ to 0" which is pretty nonsensical. I guess it's a
+ typo. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ IRExpr_Const(IRConst_F64i(0x0ULL))),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ break;
+
+ case 0xE5: { /* FXAM */
+ /* This is an interesting one. It examines %st(0),
+ regardless of whether the tag says it's empty or not.
+ Here, just pass both the tag (in our format) and the
+ value (as a double, actually a ULong) to a helper
+ function. */
+ IRExpr** args
+ = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
+ unop(Iop_ReinterpF64asI64,
+ get_ST_UNCHECKED(0)) );
+ put_C3210(mkIRExprCCall(
+ Ity_I32,
+ 0/*regparm*/,
+ "x86g_calculate_FXAM", &x86g_calculate_FXAM,
+ args
+ ));
+ DIP("fxam\n");
+ break;
+ }
+
+ case 0xE8: /* FLD1 */
+ DIP("fld1\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
+ break;
+
+ case 0xE9: /* FLDL2T */
+ DIP("fldl2t\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
+ break;
+
+ case 0xEA: /* FLDL2E */
+ DIP("fldl2e\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
+ break;
+
+ case 0xEB: /* FLDPI */
+ DIP("fldpi\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
+ break;
+
+ case 0xEC: /* FLDLG2 */
+ DIP("fldlg2\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
+ break;
+
+ case 0xED: /* FLDLN2 */
+ DIP("fldln2\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
+ break;
+
+ case 0xEE: /* FLDZ */
+ DIP("fldz\n");
+ fp_push();
+ /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
+ put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
+ break;
+
+ case 0xF0: /* F2XM1 */
+ DIP("f2xm1\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_2xm1F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ break;
+
+ case 0xF1: /* FYL2X */
+ DIP("fyl2x\n");
+ put_ST_UNCHECKED(1,
+ triop(Iop_Yl2xF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(1),
+ get_ST(0)));
+ fp_pop();
+ break;
+
+ case 0xF2: /* FPTAN */
+ DIP("ftan\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_TanF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ fp_push();
+ put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
+ clear_C2(); /* HACK */
+ break;
+
+ case 0xF3: /* FPATAN */
+ DIP("fpatan\n");
+ put_ST_UNCHECKED(1,
+ triop(Iop_AtanF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(1),
+ get_ST(0)));
+ fp_pop();
+ break;
+
+ case 0xF4: { /* FXTRACT */
+ IRTemp argF = newTemp(Ity_F64);
+ IRTemp sigF = newTemp(Ity_F64);
+ IRTemp expF = newTemp(Ity_F64);
+ IRTemp argI = newTemp(Ity_I64);
+ IRTemp sigI = newTemp(Ity_I64);
+ IRTemp expI = newTemp(Ity_I64);
+ DIP("fxtract\n");
+ assign( argF, get_ST(0) );
+ assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
+ assign( sigI,
+ mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86amd64g_calculate_FXTRACT",
+ &x86amd64g_calculate_FXTRACT,
+ mkIRExprVec_2( mkexpr(argI),
+ mkIRExpr_HWord(0)/*sig*/ ))
+ );
+ assign( expI,
+ mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86amd64g_calculate_FXTRACT",
+ &x86amd64g_calculate_FXTRACT,
+ mkIRExprVec_2( mkexpr(argI),
+ mkIRExpr_HWord(1)/*exp*/ ))
+ );
+ assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
+ assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
+ /* exponent */
+ put_ST_UNCHECKED(0, mkexpr(expF) );
+ fp_push();
+ /* significand */
+ put_ST(0, mkexpr(sigF) );
+ break;
+ }
+
+ case 0xF5: { /* FPREM1 -- IEEE compliant */
+ IRTemp a1 = newTemp(Ity_F64);
+ IRTemp a2 = newTemp(Ity_F64);
+ DIP("fprem1\n");
+ /* Do FPREM1 twice, once to get the remainder, and once
+ to get the C3210 flag values. */
+ assign( a1, get_ST(0) );
+ assign( a2, get_ST(1) );
+ put_ST_UNCHECKED(0,
+ triop(Iop_PRem1F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1),
+ mkexpr(a2)));
+ put_C3210(
+ triop(Iop_PRem1C3210F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1),
+ mkexpr(a2)) );
+ break;
+ }
+
+ case 0xF7: /* FINCSTP */
+ DIP("fprem\n");
+ put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
+ break;
+
+ case 0xF8: { /* FPREM -- not IEEE compliant */
+ IRTemp a1 = newTemp(Ity_F64);
+ IRTemp a2 = newTemp(Ity_F64);
+ DIP("fprem\n");
+ /* Do FPREM twice, once to get the remainder, and once
+ to get the C3210 flag values. */
+ assign( a1, get_ST(0) );
+ assign( a2, get_ST(1) );
+ put_ST_UNCHECKED(0,
+ triop(Iop_PRemF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1),
+ mkexpr(a2)));
+ put_C3210(
+ triop(Iop_PRemC3210F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1),
+ mkexpr(a2)) );
+ break;
+ }
+
+ case 0xF9: /* FYL2XP1 */
+ DIP("fyl2xp1\n");
+ put_ST_UNCHECKED(1,
+ triop(Iop_Yl2xp1F64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(1),
+ get_ST(0)));
+ fp_pop();
+ break;
+
+ case 0xFA: /* FSQRT */
+ DIP("fsqrt\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_SqrtF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ break;
+
+ case 0xFB: { /* FSINCOS */
+ IRTemp a1 = newTemp(Ity_F64);
+ assign( a1, get_ST(0) );
+ DIP("fsincos\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_SinF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1)));
+ fp_push();
+ put_ST(0,
+ binop(Iop_CosF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ mkexpr(a1)));
+ clear_C2(); /* HACK */
+ break;
+ }
+
+ case 0xFC: /* FRNDINT */
+ DIP("frndint\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
+ break;
+
+ case 0xFD: /* FSCALE */
+ DIP("fscale\n");
+ put_ST_UNCHECKED(0,
+ triop(Iop_ScaleF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ get_ST(1)));
+ break;
+
+ case 0xFE: /* FSIN */
+ DIP("fsin\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_SinF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ clear_C2(); /* HACK */
+ break;
+
+ case 0xFF: /* FCOS */
+ DIP("fcos\n");
+ put_ST_UNCHECKED(0,
+ binop(Iop_CosF64,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0)));
+ clear_C2(); /* HACK */
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDA) {
+
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IROp fop;
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+ switch (gregOfRM(modrm)) {
+
+ case 0: /* FIADD m32int */ /* ST(0) += m32int */
+ DIP("fiaddl %s\n", dis_buf);
+ fop = Iop_AddF64;
+ goto do_fop_m32;
+
+ case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
+ DIP("fimull %s\n", dis_buf);
+ fop = Iop_MulF64;
+ goto do_fop_m32;
+
+ case 2: /* FICOM m32int */
+ DIP("ficoml %s\n", dis_buf);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ unop(Iop_I32StoF64,
+ loadLE(Ity_I32,mkexpr(addr)))),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ break;
+
+ case 3: /* FICOMP m32int */
+ DIP("ficompl %s\n", dis_buf);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ unop(Iop_I32StoF64,
+ loadLE(Ity_I32,mkexpr(addr)))),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ fp_pop();
+ break;
+
+ case 4: /* FISUB m32int */ /* ST(0) -= m32int */
+ DIP("fisubl %s\n", dis_buf);
+ fop = Iop_SubF64;
+ goto do_fop_m32;
+
+ case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
+ DIP("fisubrl %s\n", dis_buf);
+ fop = Iop_SubF64;
+ goto do_foprev_m32;
+
+ case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
+ DIP("fidivl %s\n", dis_buf);
+ fop = Iop_DivF64;
+ goto do_fop_m32;
+
+ case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
+ DIP("fidivrl %s\n", dis_buf);
+ fop = Iop_DivF64;
+ goto do_foprev_m32;
+
+ do_fop_m32:
+ put_ST_UNCHECKED(0,
+ triop(fop,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ unop(Iop_I32StoF64,
+ loadLE(Ity_I32, mkexpr(addr)))));
+ break;
+
+ do_foprev_m32:
+ put_ST_UNCHECKED(0,
+ triop(fop,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ unop(Iop_I32StoF64,
+ loadLE(Ity_I32, mkexpr(addr))),
+ get_ST(0)));
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
+ vex_printf("first_opcode == 0xDA\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xC0;
+ DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(X86CondB)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xC8;
+ DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(X86CondZ)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xD0;
+ DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(X86CondBE)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xD8;
+ DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(X86CondP)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xE9: /* FUCOMPP %st(0),%st(1) */
+ DIP("fucompp %%st(0),%%st(1)\n");
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(1)),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ fp_pop();
+ fp_pop();
+ break;
+
+ default:
+ goto decode_fail;
+ }
+
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDB) {
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+
+ switch (gregOfRM(modrm)) {
+
+ case 0: /* FILD m32int */
+ DIP("fildl %s\n", dis_buf);
+ fp_push();
+ put_ST(0, unop(Iop_I32StoF64,
+ loadLE(Ity_I32, mkexpr(addr))));
+ break;
+
+ case 1: /* FISTTPL m32 (SSE3) */
+ DIP("fisttpl %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
+ fp_pop();
+ break;
+
+ case 2: /* FIST m32 */
+ DIP("fistl %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
+ break;
+
+ case 3: /* FISTP m32 */
+ DIP("fistpl %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
+ fp_pop();
+ break;
+
+ case 5: { /* FLD extended-real */
+ /* Uses dirty helper:
+                  ULong x86g_dirtyhelper_loadF80le ( UInt )
+ addr holds the address. First, do a dirty call to
+ get hold of the data. */
+ IRTemp val = newTemp(Ity_I64);
+ IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
+
+ IRDirty* d = unsafeIRDirty_1_N (
+ val,
+ 0/*regparms*/,
+ "x86g_dirtyhelper_loadF80le",
+ &x86g_dirtyhelper_loadF80le,
+ args
+ );
+ /* declare that we're reading memory */
+ d->mFx = Ifx_Read;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 10;
+
+ /* execute the dirty call, dumping the result in val. */
+ stmt( IRStmt_Dirty(d) );
+ fp_push();
+ put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
+
+ DIP("fldt %s\n", dis_buf);
+ break;
+ }
+
+ case 7: { /* FSTP extended-real */
+            /* Uses dirty helper: void x86g_dirtyhelper_storeF80le ( UInt, ULong ) */
+ IRExpr** args
+ = mkIRExprVec_2( mkexpr(addr),
+ unop(Iop_ReinterpF64asI64, get_ST(0)) );
+
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "x86g_dirtyhelper_storeF80le",
+ &x86g_dirtyhelper_storeF80le,
+ args
+ );
+ /* declare we're writing memory */
+ d->mFx = Ifx_Write;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 10;
+
+ /* execute the dirty call. */
+ stmt( IRStmt_Dirty(d) );
+ fp_pop();
+
+ DIP("fstpt\n %s", dis_buf);
+ break;
+ }
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
+ vex_printf("first_opcode == 0xDB\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xC0;
+ DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(X86CondNB)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xC8;
+ DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(X86CondNZ)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xD0;
+ DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(X86CondNBE)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+ case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
+ r_src = (UInt)modrm - 0xD8;
+ DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
+ put_ST_UNCHECKED(0,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(X86CondNP)),
+ get_ST(0), get_ST(r_src)) );
+ break;
+
+         case 0xE2: /* FNCLEX */
+            /* Nothing to do: the FP exception flags aren't modelled. */
+            DIP("fnclex\n");
+            break;
+
+         case 0xE3: { /* FNINIT */
+ /* Uses dirty helper:
+               void x86g_dirtyhelper_FINIT ( VexGuestX86State* ) */
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "x86g_dirtyhelper_FINIT",
+ &x86g_dirtyhelper_FINIT,
+ mkIRExprVec_0()
+ );
+ d->needsBBP = True;
+
+ /* declare we're writing guest state */
+ d->nFxState = 5;
+
+ d->fxState[0].fx = Ifx_Write;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Write;
+ d->fxState[1].offset = OFFB_FPREGS;
+ d->fxState[1].size = 8 * sizeof(ULong);
+
+ d->fxState[2].fx = Ifx_Write;
+ d->fxState[2].offset = OFFB_FPTAGS;
+ d->fxState[2].size = 8 * sizeof(UChar);
+
+ d->fxState[3].fx = Ifx_Write;
+ d->fxState[3].offset = OFFB_FPROUND;
+ d->fxState[3].size = sizeof(UInt);
+
+ d->fxState[4].fx = Ifx_Write;
+ d->fxState[4].offset = OFFB_FC3210;
+ d->fxState[4].size = sizeof(UInt);
+
+ stmt( IRStmt_Dirty(d) );
+
+ DIP("fninit\n");
+ break;
+ }
+
+ case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
+ fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
+ break;
+
+ case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
+ fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDC) {
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+
+ switch (gregOfRM(modrm)) {
+
+ case 0: /* FADD double-real */
+ fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
+ break;
+
+ case 1: /* FMUL double-real */
+ fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
+ break;
+
+ case 2: /* FCOM double-real */
+ DIP("fcoml %s\n", dis_buf);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ loadLE(Ity_F64,mkexpr(addr))),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ break;
+
+ case 3: /* FCOMP double-real */
+ DIP("fcompl %s\n", dis_buf);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ loadLE(Ity_F64,mkexpr(addr))),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ fp_pop();
+ break;
+
+ case 4: /* FSUB double-real */
+ fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
+ break;
+
+ case 5: /* FSUBR double-real */
+ fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
+ break;
+
+ case 6: /* FDIV double-real */
+ fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
+ break;
+
+ case 7: /* FDIVR double-real */
+ fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
+ vex_printf("first_opcode == 0xDC\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
+ fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
+ break;
+
+ case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
+ fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
+ break;
+
+ case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
+ fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
+ break;
+
+ case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
+ fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
+ break;
+
+ case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
+ fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
+ break;
+
+ case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
+ fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDD) {
+
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+
+ switch (gregOfRM(modrm)) {
+
+ case 0: /* FLD double-real */
+ DIP("fldl %s\n", dis_buf);
+ fp_push();
+ put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
+ break;
+
+ case 1: /* FISTTPQ m64 (SSE3) */
+ DIP("fistppll %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
+ fp_pop();
+ break;
+
+ case 2: /* FST double-real */
+ DIP("fstl %s\n", dis_buf);
+ storeLE(mkexpr(addr), get_ST(0));
+ break;
+
+ case 3: /* FSTP double-real */
+ DIP("fstpl %s\n", dis_buf);
+ storeLE(mkexpr(addr), get_ST(0));
+ fp_pop();
+ break;
+
+ case 4: { /* FRSTOR m108 */
+ /* Uses dirty helper:
+ VexEmWarn x86g_dirtyhelper_FRSTOR ( VexGuestX86State*, Addr32 ) */
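+ /* The 108-byte area is the 32-bit protected-mode FSAVE/FRSTOR
+ image: a 28-byte environment block (control, status and tag
+ words plus FIP/FDP) followed by the eight 10-byte ST
+ registers, 28 + 8*10 = 108. */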
+ IRTemp ew = newTemp(Ity_I32);
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "x86g_dirtyhelper_FRSTOR",
+ &x86g_dirtyhelper_FRSTOR,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+ d->tmp = ew;
+ /* declare we're reading memory */
+ d->mFx = Ifx_Read;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 108;
+
+ /* declare we're writing guest state */
+ d->nFxState = 5;
+
+ d->fxState[0].fx = Ifx_Write;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Write;
+ d->fxState[1].offset = OFFB_FPREGS;
+ d->fxState[1].size = 8 * sizeof(ULong);
+
+ d->fxState[2].fx = Ifx_Write;
+ d->fxState[2].offset = OFFB_FPTAGS;
+ d->fxState[2].size = 8 * sizeof(UChar);
+
+ d->fxState[3].fx = Ifx_Write;
+ d->fxState[3].offset = OFFB_FPROUND;
+ d->fxState[3].size = sizeof(UInt);
+
+ d->fxState[4].fx = Ifx_Write;
+ d->fxState[4].offset = OFFB_FC3210;
+ d->fxState[4].size = sizeof(UInt);
+
+ stmt( IRStmt_Dirty(d) );
+
+ /* ew contains any emulation warning we may need to
+ issue. If needed, side-exit to the next insn,
+ reporting the warning, so that Valgrind's dispatcher
+ sees the warning. */
+ put_emwarn( mkexpr(ew) );
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
+ Ijk_EmWarn,
+ IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
+ )
+ );
+
+ DIP("frstor %s\n", dis_buf);
+ break;
+ }
+
+ case 6: { /* FNSAVE m108 */
+ /* Uses dirty helper:
+ void x86g_dirtyhelper_FSAVE ( VexGuestX86State*, UInt ) */
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "x86g_dirtyhelper_FSAVE",
+ &x86g_dirtyhelper_FSAVE,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+ /* declare we're writing memory */
+ d->mFx = Ifx_Write;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 108;
+
+ /* declare we're reading guest state */
+ d->nFxState = 5;
+
+ d->fxState[0].fx = Ifx_Read;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Read;
+ d->fxState[1].offset = OFFB_FPREGS;
+ d->fxState[1].size = 8 * sizeof(ULong);
+
+ d->fxState[2].fx = Ifx_Read;
+ d->fxState[2].offset = OFFB_FPTAGS;
+ d->fxState[2].size = 8 * sizeof(UChar);
+
+ d->fxState[3].fx = Ifx_Read;
+ d->fxState[3].offset = OFFB_FPROUND;
+ d->fxState[3].size = sizeof(UInt);
+
+ d->fxState[4].fx = Ifx_Read;
+ d->fxState[4].offset = OFFB_FC3210;
+ d->fxState[4].size = sizeof(UInt);
+
+ stmt( IRStmt_Dirty(d) );
+
+ DIP("fnsave %s\n", dis_buf);
+ break;
+ }
+
+ case 7: { /* FNSTSW m16 */
+ IRExpr* sw = get_FPU_sw();
+ vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
+ storeLE( mkexpr(addr), sw );
+ DIP("fnstsw %s\n", dis_buf);
+ break;
+ }
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
+ vex_printf("first_opcode == 0xDD\n");
+ goto decode_fail;
+ }
+ } else {
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FFREE %st(?) */
+ r_dst = (UInt)modrm - 0xC0;
+ DIP("ffree %%st(%d)\n", (Int)r_dst);
+ put_ST_TAG ( r_dst, mkU8(0) );
+ break;
+
+ case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
+ r_dst = (UInt)modrm - 0xD0;
+ DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
+ /* P4 manual says: "If the destination operand is a
+ non-empty register, the invalid-operation exception
+ is not generated." Hence put_ST_UNCHECKED. */
+ put_ST_UNCHECKED(r_dst, get_ST(0));
+ break;
+
+ case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
+ r_dst = (UInt)modrm - 0xD8;
+ DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
+ /* P4 manual says: "If the destination operand is a
+ non-empty register, the invalid-operation exception
+ is not generated." Hence put_ST_UNCHECKED. */
+ put_ST_UNCHECKED(r_dst, get_ST(0));
+ fp_pop();
+ break;
+
+ case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
+ r_dst = (UInt)modrm - 0xE0;
+ DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ break;
+
+ case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
+ r_dst = (UInt)modrm - 0xE8;
+ DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ fp_pop();
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDE) {
+
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IROp fop;
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+
+ switch (gregOfRM(modrm)) {
+
+ case 0: /* FIADD m16int */ /* ST(0) += m16int */
+ DIP("fiaddw %s\n", dis_buf);
+ fop = Iop_AddF64;
+ goto do_fop_m16;
+
+ case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
+ DIP("fimulw %s\n", dis_buf);
+ fop = Iop_MulF64;
+ goto do_fop_m16;
+
+ case 2: /* FICOM m16int */
+ DIP("ficomw %s\n", dis_buf);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ unop(Iop_I32StoF64,
+ unop(Iop_16Sto32,
+ loadLE(Ity_I16,mkexpr(addr))))),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ break;
+
+ case 3: /* FICOMP m16int */
+ DIP("ficompw %s\n", dis_buf);
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64,
+ get_ST(0),
+ unop(Iop_I32StoF64,
+ unop(Iop_16Sto32,
+ loadLE(Ity_I16,mkexpr(addr))))),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ fp_pop();
+ break;
+
+ case 4: /* FISUB m16int */ /* ST(0) -= m16int */
+ DIP("fisubw %s\n", dis_buf);
+ fop = Iop_SubF64;
+ goto do_fop_m16;
+
+ case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
+ DIP("fisubrw %s\n", dis_buf);
+ fop = Iop_SubF64;
+ goto do_foprev_m16;
+
+ case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
+ DIP("fisubw %s\n", dis_buf);
+ fop = Iop_DivF64;
+ goto do_fop_m16;
+
+ case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
+ DIP("fidivrw %s\n", dis_buf);
+ fop = Iop_DivF64;
+ goto do_foprev_m16;
+
+ do_fop_m16:
+ put_ST_UNCHECKED(0,
+ triop(fop,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ get_ST(0),
+ unop(Iop_I32StoF64,
+ unop(Iop_16Sto32,
+ loadLE(Ity_I16, mkexpr(addr))))));
+ break;
+
+ do_foprev_m16:
+ put_ST_UNCHECKED(0,
+ triop(fop,
+ get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
+ unop(Iop_I32StoF64,
+ unop(Iop_16Sto32,
+ loadLE(Ity_I16, mkexpr(addr)))),
+ get_ST(0)));
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
+ vex_printf("first_opcode == 0xDE\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
+ fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
+ break;
+
+ case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
+ fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
+ break;
+
+ case 0xD9: /* FCOMPP %st(0),%st(1) */
+ DIP("fuompp %%st(0),%%st(1)\n");
+ /* This forces C1 to zero, which isn't right. */
+ put_C3210(
+ binop( Iop_And32,
+ binop(Iop_Shl32,
+ binop(Iop_CmpF64, get_ST(0), get_ST(1)),
+ mkU8(8)),
+ mkU32(0x4500)
+ ));
+ fp_pop();
+ fp_pop();
+ break;
+
+ case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
+ fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, True );
+ break;
+
+ case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
+ fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, True );
+ break;
+
+ case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
+ fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
+ break;
+
+ case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
+ fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+
+ }
+ }
+
+ /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
+ else
+ if (first_opcode == 0xDF) {
+
+ if (modrm < 0xC0) {
+
+ /* bits 5,4,3 are an opcode extension, and the modRM also
+ specifies an address. */
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+
+ switch (gregOfRM(modrm)) {
+
+ case 0: /* FILD m16int */
+ DIP("fildw %s\n", dis_buf);
+ fp_push();
+ put_ST(0, unop(Iop_I32StoF64,
+ unop(Iop_16Sto32,
+ loadLE(Ity_I16, mkexpr(addr)))));
+ break;
+
+ case 1: /* FISTTPS m16 (SSE3) */
+ DIP("fisttps %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
+ fp_pop();
+ break;
+
+ case 2: /* FIST m16 */
+ DIP("fistp %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
+ break;
+
+ case 3: /* FISTP m16 */
+ DIP("fistps %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
+ fp_pop();
+ break;
+
+ case 5: /* FILD m64 */
+ DIP("fildll %s\n", dis_buf);
+ fp_push();
+ put_ST(0, binop(Iop_I64StoF64,
+ get_roundingmode(),
+ loadLE(Ity_I64, mkexpr(addr))));
+ break;
+
+ case 7: /* FISTP m64 */
+ DIP("fistpll %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
+ fp_pop();
+ break;
+
+ default:
+ vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
+ vex_printf("first_opcode == 0xDF\n");
+ goto decode_fail;
+ }
+
+ } else {
+
+ delta++;
+ switch (modrm) {
+
+ case 0xC0: /* FFREEP %st(0) */
+ DIP("ffreep %%st(%d)\n", 0);
+ put_ST_TAG ( 0, mkU8(0) );
+ fp_pop();
+ break;
+
+ case 0xE0: /* FNSTSW %ax */
+ DIP("fnstsw %%ax\n");
+ /* Get the FPU status word value and dump it in %AX. */
+ if (0) {
+ /* The obvious thing to do is simply dump the 16-bit
+ status word value in %AX. However, due to a
+ limitation in Memcheck's origin tracking
+ machinery, this causes Memcheck not to track the
+ origin of any undefinedness into %AH (only into
+ %AL/%AX/%EAX), which means origins are lost in
+ the sequence "fnstsw %ax; test $M,%ah; jcond .." */
+ putIReg(2, R_EAX, get_FPU_sw());
+ } else {
+ /* So a somewhat lame kludge is to make it very
+ clear to Memcheck that the value is written to
+ both %AH and %AL. This generates marginally
+ worse code, but I don't think it matters much. */
+ IRTemp t16 = newTemp(Ity_I16);
+ assign(t16, get_FPU_sw());
+ putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
+ putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
+ }
+ break;
+
+ case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
+ fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
+ break;
+
+ case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
+ /* not really right since COMIP != UCOMIP */
+ fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
+ break;
+
+ default:
+ goto decode_fail;
+ }
+ }
+
+ }
+
+ else
+ vpanic("dis_FPU(x86): invalid primary opcode");
+
+ *decode_ok = True;
+ return delta;
+
+ decode_fail:
+ *decode_ok = False;
+ return delta;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- ---*/
+/*--- MMX INSTRUCTIONS ---*/
+/*--- ---*/
+/*------------------------------------------------------------*/
+
+/* Effect of MMX insns on x87 FPU state (table 11-2 of
+ IA32 arch manual, volume 3):
+
+ Read from, or write to MMX register (viz, any insn except EMMS):
+ * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
+ * FP stack pointer set to zero
+
+ EMMS:
+ * All tags set to Invalid (empty) -- FPTAGS[i] := zero
+ * FP stack pointer set to zero
+*/
+
+static void do_MMX_preamble ( void )
+{
+ Int i;
+ IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
+ IRExpr* zero = mkU32(0);
+ IRExpr* tag1 = mkU8(1);
+ put_ftop(zero);
+ for (i = 0; i < 8; i++)
+ stmt( IRStmt_PutI( descr, zero, i, tag1 ) );
+}
+
+static void do_EMMS_preamble ( void )
+{
+ Int i;
+ IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
+ IRExpr* zero = mkU32(0);
+ IRExpr* tag0 = mkU8(0);
+ put_ftop(zero);
+ for (i = 0; i < 8; i++)
+ stmt( IRStmt_PutI( descr, zero, i, tag0 ) );
+}
+
+
+static IRExpr* getMMXReg ( UInt archreg )
+{
+ vassert(archreg < 8);
+ return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
+}
+
+
+static void putMMXReg ( UInt archreg, IRExpr* e )
+{
+ vassert(archreg < 8);
+ vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
+ stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
+}
+
+
+/* Helper for non-shift MMX insns. Note this is incomplete in the
+ sense that it does not first call do_MMX_preamble() -- that is the
+ responsibility of its caller. */
+
+static
+UInt dis_MMXop_regmem_to_reg ( UChar sorb,
+ Int delta,
+ UChar opc,
+ HChar* name,
+ Bool show_granularity )
+{
+ HChar dis_buf[50];
+ UChar modrm = getIByte(delta);
+ Bool isReg = epartIsReg(modrm);
+ IRExpr* argL = NULL;
+ IRExpr* argR = NULL;
+ IRExpr* argG = NULL;
+ IRExpr* argE = NULL;
+ IRTemp res = newTemp(Ity_I64);
+
+ Bool invG = False;
+ IROp op = Iop_INVALID;
+ void* hAddr = NULL;
+ HChar* hName = NULL;
+ Bool eLeft = False;
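+ /* eLeft == True makes the E operand the left (first) argument
+ of the IROp, which matters for the non-commutative pack and
+ unpack operations below. For example psubb (0xF8) leaves
+ eLeft False, so the result is Sub8x8(G, E), i.e.
+ dst = dst - src, as the architecture requires. */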
+
+# define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
+
+ switch (opc) {
+ /* Original MMX ones */
+ case 0xFC: op = Iop_Add8x8; break;
+ case 0xFD: op = Iop_Add16x4; break;
+ case 0xFE: op = Iop_Add32x2; break;
+
+ case 0xEC: op = Iop_QAdd8Sx8; break;
+ case 0xED: op = Iop_QAdd16Sx4; break;
+
+ case 0xDC: op = Iop_QAdd8Ux8; break;
+ case 0xDD: op = Iop_QAdd16Ux4; break;
+
+ case 0xF8: op = Iop_Sub8x8; break;
+ case 0xF9: op = Iop_Sub16x4; break;
+ case 0xFA: op = Iop_Sub32x2; break;
+
+ case 0xE8: op = Iop_QSub8Sx8; break;
+ case 0xE9: op = Iop_QSub16Sx4; break;
+
+ case 0xD8: op = Iop_QSub8Ux8; break;
+ case 0xD9: op = Iop_QSub16Ux4; break;
+
+ case 0xE5: op = Iop_MulHi16Sx4; break;
+ case 0xD5: op = Iop_Mul16x4; break;
+ case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
+
+ case 0x74: op = Iop_CmpEQ8x8; break;
+ case 0x75: op = Iop_CmpEQ16x4; break;
+ case 0x76: op = Iop_CmpEQ32x2; break;
+
+ case 0x64: op = Iop_CmpGT8Sx8; break;
+ case 0x65: op = Iop_CmpGT16Sx4; break;
+ case 0x66: op = Iop_CmpGT32Sx2; break;
+
+ case 0x6B: op = Iop_QNarrow32Sx2; eLeft = True; break;
+ case 0x63: op = Iop_QNarrow16Sx4; eLeft = True; break;
+ case 0x67: op = Iop_QNarrow16Ux4; eLeft = True; break;
+
+ case 0x68: op = Iop_InterleaveHI8x8; eLeft = True; break;
+ case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
+ case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
+
+ case 0x60: op = Iop_InterleaveLO8x8; eLeft = True; break;
+ case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
+ case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
+
+ case 0xDB: op = Iop_And64; break;
+ case 0xDF: op = Iop_And64; invG = True; break;
+ case 0xEB: op = Iop_Or64; break;
+ case 0xEF: /* Possibly do better here if argL and argR are the
+ same reg */
+ op = Iop_Xor64; break;
+
+ /* Introduced in SSE1 */
+ case 0xE0: op = Iop_Avg8Ux8; break;
+ case 0xE3: op = Iop_Avg16Ux4; break;
+ case 0xEE: op = Iop_Max16Sx4; break;
+ case 0xDE: op = Iop_Max8Ux8; break;
+ case 0xEA: op = Iop_Min16Sx4; break;
+ case 0xDA: op = Iop_Min8Ux8; break;
+ case 0xE4: op = Iop_MulHi16Ux4; break;
+ case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
+
+ /* Introduced in SSE2 */
+ case 0xD4: op = Iop_Add64; break;
+ case 0xFB: op = Iop_Sub64; break;
+
+ default:
+ vex_printf("\n0x%x\n", (Int)opc);
+ vpanic("dis_MMXop_regmem_to_reg");
+ }
+
+# undef XXX
+
+ argG = getMMXReg(gregOfRM(modrm));
+ if (invG)
+ argG = unop(Iop_Not64, argG);
+
+ if (isReg) {
+ delta++;
+ argE = getMMXReg(eregOfRM(modrm));
+ } else {
+ Int len;
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+ argE = loadLE(Ity_I64, mkexpr(addr));
+ }
+
+ if (eLeft) {
+ argL = argE;
+ argR = argG;
+ } else {
+ argL = argG;
+ argR = argE;
+ }
+
+ if (op != Iop_INVALID) {
+ vassert(hName == NULL);
+ vassert(hAddr == NULL);
+ assign(res, binop(op, argL, argR));
+ } else {
+ vassert(hName != NULL);
+ vassert(hAddr != NULL);
+ assign( res,
+ mkIRExprCCall(
+ Ity_I64,
+ 0/*regparms*/, hName, hAddr,
+ mkIRExprVec_2( argL, argR )
+ )
+ );
+ }
+
+ putMMXReg( gregOfRM(modrm), mkexpr(res) );
+
+ DIP("%s%s %s, %s\n",
+ name, show_granularity ? nameMMXGran(opc & 3) : "",
+ ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
+ nameMMXReg(gregOfRM(modrm)) );
+
+ return delta;
+}
+
+
+/* Vector by scalar shift of G by the amount specified at the bottom
+ of E. This is a straight copy of dis_SSE_shiftG_byE. */
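+ /* The Mux0X guards below implement x86's out-of-range shift
+ behaviour: e.g. psllw with a count of 20 (>= the 16-bit lane
+ width) must zero every lane, while psraw with an out-of-range
+ count behaves like a shift by lanewidth-1, filling each lane
+ with its sign bit. */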
+
+static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
+ HChar* opname, IROp op )
+{
+ HChar dis_buf[50];
+ Int alen, size;
+ IRTemp addr;
+ Bool shl, shr, sar;
+ UChar rm = getIByte(delta);
+ IRTemp g0 = newTemp(Ity_I64);
+ IRTemp g1 = newTemp(Ity_I64);
+ IRTemp amt = newTemp(Ity_I32);
+ IRTemp amt8 = newTemp(Ity_I8);
+
+ if (epartIsReg(rm)) {
+ assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
+ DIP("%s %s,%s\n", opname,
+ nameMMXReg(eregOfRM(rm)),
+ nameMMXReg(gregOfRM(rm)) );
+ delta++;
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameMMXReg(gregOfRM(rm)) );
+ delta += alen;
+ }
+ assign( g0, getMMXReg(gregOfRM(rm)) );
+ assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
+
+ shl = shr = sar = False;
+ size = 0;
+ switch (op) {
+ case Iop_ShlN16x4: shl = True; size = 16; break;
+ case Iop_ShlN32x2: shl = True; size = 32; break;
+ case Iop_Shl64: shl = True; size = 64; break;
+ case Iop_ShrN16x4: shr = True; size = 16; break;
+ case Iop_ShrN32x2: shr = True; size = 32; break;
+ case Iop_Shr64: shr = True; size = 64; break;
+ case Iop_SarN16x4: sar = True; size = 16; break;
+ case Iop_SarN32x2: sar = True; size = 32; break;
+ default: vassert(0);
+ }
+
+ if (shl || shr) {
+ assign(
+ g1,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
+ mkU64(0),
+ binop(op, mkexpr(g0), mkexpr(amt8))
+ )
+ );
+ } else
+ if (sar) {
+ assign(
+ g1,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
+ binop(op, mkexpr(g0), mkU8(size-1)),
+ binop(op, mkexpr(g0), mkexpr(amt8))
+ )
+ );
+ } else {
+ /*NOTREACHED*/
+ vassert(0);
+ }
+
+ putMMXReg( gregOfRM(rm), mkexpr(g1) );
+ return delta;
+}
+
+
+/* Vector by scalar shift of E by an immediate byte. This is a
+ straight copy of dis_SSE_shiftE_imm. */
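+ /* Since the amount is an immediate, the out-of-range case is
+ resolved at decode time: e.g. psrlw $20, %mm3 (20 >= 16)
+ simply assigns zero to every lane, with no run-time guard. */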
+
+static
+UInt dis_MMX_shiftE_imm ( Int delta, HChar* opname, IROp op )
+{
+ Bool shl, shr, sar;
+ UChar rm = getIByte(delta);
+ IRTemp e0 = newTemp(Ity_I64);
+ IRTemp e1 = newTemp(Ity_I64);
+ UChar amt, size;
+ vassert(epartIsReg(rm));
+ vassert(gregOfRM(rm) == 2
+ || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
+ amt = getIByte(delta+1);
+ delta += 2;
+ DIP("%s $%d,%s\n", opname,
+ (Int)amt,
+ nameMMXReg(eregOfRM(rm)) );
+
+ assign( e0, getMMXReg(eregOfRM(rm)) );
+
+ shl = shr = sar = False;
+ size = 0;
+ switch (op) {
+ case Iop_ShlN16x4: shl = True; size = 16; break;
+ case Iop_ShlN32x2: shl = True; size = 32; break;
+ case Iop_Shl64: shl = True; size = 64; break;
+ case Iop_SarN16x4: sar = True; size = 16; break;
+ case Iop_SarN32x2: sar = True; size = 32; break;
+ case Iop_ShrN16x4: shr = True; size = 16; break;
+ case Iop_ShrN32x2: shr = True; size = 32; break;
+ case Iop_Shr64: shr = True; size = 64; break;
+ default: vassert(0);
+ }
+
+ if (shl || shr) {
+ assign( e1, amt >= size
+ ? mkU64(0)
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else
+ if (sar) {
+ assign( e1, amt >= size
+ ? binop(op, mkexpr(e0), mkU8(size-1))
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else {
+ /*NOTREACHED*/
+ vassert(0);
+ }
+
+ putMMXReg( eregOfRM(rm), mkexpr(e1) );
+ return delta;
+}
+
+
+/* Completely handle all MMX instructions except emms. */
+
+static
+UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
+{
+ Int len;
+ UChar modrm;
+ HChar dis_buf[50];
+ UChar opc = getIByte(delta);
+ delta++;
+
+ /* dis_MMX handles all insns except emms. */
+ do_MMX_preamble();
+
+ switch (opc) {
+
+ case 0x6E:
+ /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
+ if (sz != 4)
+ goto mmx_decode_failure;
+ modrm = getIByte(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putMMXReg(
+ gregOfRM(modrm),
+ binop( Iop_32HLto64,
+ mkU32(0),
+ getIReg(4, eregOfRM(modrm)) ) );
+ DIP("movd %s, %s\n",
+ nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
+ } else {
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+ putMMXReg(
+ gregOfRM(modrm),
+ binop( Iop_32HLto64,
+ mkU32(0),
+ loadLE(Ity_I32, mkexpr(addr)) ) );
+ DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
+ }
+ break;
+
+ case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ modrm = getIByte(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putIReg( 4, eregOfRM(modrm),
+ unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
+ DIP("movd %s, %s\n",
+ nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
+ } else {
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+ storeLE( mkexpr(addr),
+ unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
+ DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
+ }
+ break;
+
+ case 0x6F:
+ /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ modrm = getIByte(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
+ DIP("movq %s, %s\n",
+ nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
+ } else {
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+ putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movq %s, %s\n",
+ dis_buf, nameMMXReg(gregOfRM(modrm)));
+ }
+ break;
+
+ case 0x7F:
+ /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ modrm = getIByte(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
+ DIP("movq %s, %s\n",
+ nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
+ } else {
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+ storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
+ DIP("mov(nt)q %s, %s\n",
+ nameMMXReg(gregOfRM(modrm)), dis_buf);
+ }
+ break;
+
+ case 0xFC:
+ case 0xFD:
+ case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
+ break;
+
+ case 0xEC:
+ case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
+ break;
+
+ case 0xDC:
+ case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
+ break;
+
+ case 0xF8:
+ case 0xF9:
+ case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
+ break;
+
+ case 0xE8:
+ case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
+ break;
+
+ case 0xD8:
+ case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
+ break;
+
+ case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
+ break;
+
+ case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
+ break;
+
+ case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
+ vassert(sz == 4);
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
+ break;
+
+ case 0x74:
+ case 0x75:
+ case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
+ break;
+
+ case 0x64:
+ case 0x65:
+ case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
+ break;
+
+ case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
+ break;
+
+ case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
+ break;
+
+ case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
+ break;
+
+ case 0x68:
+ case 0x69:
+ case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
+ break;
+
+ case 0x60:
+ case 0x61:
+ case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
+ break;
+
+ case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
+ break;
+
+ case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
+ break;
+
+ case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
+ break;
+
+ case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
+ if (sz != 4)
+ goto mmx_decode_failure;
+ delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
+ break;
+
+# define SHIFT_BY_REG(_name,_op) \
+ delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
+ break;
+
+ /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
+ case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
+ case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
+
+ /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
+ case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
+ case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
+
+ /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
+ case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
+
+# undef SHIFT_BY_REG
+
+ case 0x71:
+ case 0x72:
+ case 0x73: {
+ /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
+ UChar byte2, subopc;
+ if (sz != 4)
+ goto mmx_decode_failure;
+ byte2 = getIByte(delta); /* amode / sub-opcode */
+ subopc = toUChar( (byte2 >> 3) & 7 );
+
+# define SHIFT_BY_IMM(_name,_op) \
+ do { delta = dis_MMX_shiftE_imm(delta,_name,_op); \
+ } while (0)
+
+ if (subopc == 2 /*SRL*/ && opc == 0x71)
+ SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
+ else if (subopc == 2 /*SRL*/ && opc == 0x72)
+ SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
+ else if (subopc == 2 /*SRL*/ && opc == 0x73)
+ SHIFT_BY_IMM("psrlq", Iop_Shr64);
+
+ else if (subopc == 4 /*SAR*/ && opc == 0x71)
+ SHIFT_BY_IMM("psraw", Iop_SarN16x4);
+ else if (subopc == 4 /*SAR*/ && opc == 0x72)
+ SHIFT_BY_IMM("psrad", Iop_SarN32x2);
+
+ else if (subopc == 6 /*SHL*/ && opc == 0x71)
+ SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
+ else if (subopc == 6 /*SHL*/ && opc == 0x72)
+ SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
+ else if (subopc == 6 /*SHL*/ && opc == 0x73)
+ SHIFT_BY_IMM("psllq", Iop_Shl64);
+
+ else goto mmx_decode_failure;
+
+# undef SHIFT_BY_IMM
+ break;
+ }
+
+ case 0xF7: {
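+ /* MASKMOVQ: byte-masked store of the G register to [EDI],
+ honouring any segment override. A sketch of the scheme:
+ SarN8x8(E, 7) replicates each byte's sign bit across that
+ byte, giving an all-ones/all-zeroes per-byte mask; bytes of
+ G selected by the mask are merged with the old memory
+ contents and written back, so this is a (non-atomic)
+ read-modify-write rather than a true partial store. */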
+ IRTemp addr = newTemp(Ity_I32);
+ IRTemp regD = newTemp(Ity_I64);
+ IRTemp regM = newTemp(Ity_I64);
+ IRTemp mask = newTemp(Ity_I64);
+ IRTemp olddata = newTemp(Ity_I64);
+ IRTemp newdata = newTemp(Ity_I64);
+
+ modrm = getIByte(delta);
+ if (sz != 4 || (!epartIsReg(modrm)))
+ goto mmx_decode_failure;
+ delta++;
+
+ assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
+ assign( regM, getMMXReg( eregOfRM(modrm) ));
+ assign( regD, getMMXReg( gregOfRM(modrm) ));
+ assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
+ assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
+ assign( newdata,
+ binop(Iop_Or64,
+ binop(Iop_And64,
+ mkexpr(regD),
+ mkexpr(mask) ),
+ binop(Iop_And64,
+ mkexpr(olddata),
+ unop(Iop_Not64, mkexpr(mask)))) );
+ storeLE( mkexpr(addr), mkexpr(newdata) );
+ DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
+ nameMMXReg( gregOfRM(modrm) ) );
+ break;
+ }
+
+ /* --- MMX decode failure --- */
+ default:
+ mmx_decode_failure:
+ *decode_ok = False;
+ return delta; /* ignored */
+
+ }
+
+ *decode_ok = True;
+ return delta;
+}
+
+
+/*------------------------------------------------------------*/
+/*--- More misc arithmetic and other obscure insns. ---*/
+/*------------------------------------------------------------*/
+
+/* Double length left and right shifts. Apparently only required in
+ v-size (no b- variant). */
+static
+UInt dis_SHLRD_Gv_Ev ( UChar sorb,
+ Int delta, UChar modrm,
+ Int sz,
+ IRExpr* shift_amt,
+ Bool amt_is_literal,
+ HChar* shift_amt_txt,
+ Bool left_shift )
+{
+ /* shift_amt :: Ity_I8 is the amount to shift. shift_amt_txt is used
+ for printing it. And eip on entry points at the modrm byte. */
+ Int len;
+ HChar dis_buf[50];
+
+ IRType ty = szToITy(sz);
+ IRTemp gsrc = newTemp(ty);
+ IRTemp esrc = newTemp(ty);
+ IRTemp addr = IRTemp_INVALID;
+ IRTemp tmpSH = newTemp(Ity_I8);
+ IRTemp tmpL = IRTemp_INVALID;
+ IRTemp tmpRes = IRTemp_INVALID;
+ IRTemp tmpSubSh = IRTemp_INVALID;
+ IROp mkpair;
+ IROp getres;
+ IROp shift;
+ IRExpr* mask = NULL;
+
+ vassert(sz == 2 || sz == 4);
+
+ /* The E-part is the destination; this is shifted. The G-part
+ supplies bits to be shifted into the E-part, but is not
+ changed.
+
+ If shifting left, form a double-length word with E at the top
+ and G at the bottom, and shift this left. The result is then in
+ the high part.
+
+ If shifting right, form a double-length word with G at the top
+ and E at the bottom, and shift this right. The result is then
+ at the bottom. */
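+ /* Worked example: shldl $8, %ebx, %eax forms the 64-bit value
+ EAX:EBX, shifts it left by 8 and keeps the high half, so EAX
+ becomes (EAX << 8) | (EBX >> 24); shrdl $8, %ebx, %eax forms
+ EBX:EAX, shifts it right by 8 and keeps the low half, giving
+ (EAX >> 8) | (EBX << 24). */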
+
+ /* Fetch the operands. */
+
+ assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ delta++;
+ assign( esrc, getIReg(sz, eregOfRM(modrm)) );
+ DIP("sh%cd%c %s, %s, %s\n",
+ ( left_shift ? 'l' : 'r' ), nameISize(sz),
+ shift_amt_txt,
+ nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &len, sorb, delta, dis_buf );
+ delta += len;
+ assign( esrc, loadLE(ty, mkexpr(addr)) );
+ DIP("sh%cd%c %s, %s, %s\n",
+ ( left_shift ? 'l' : 'r' ), nameISize(sz),
+ shift_amt_txt,
+ nameIReg(sz, gregOfRM(modrm)), dis_buf);
+ }
+
+ /* Round up the relevant primops. */
+
+ if (sz == 4) {
+ tmpL = newTemp(Ity_I64);
+ tmpRes = newTemp(Ity_I32);
+ tmpSubSh = newTemp(Ity_I32);
+ mkpair = Iop_32HLto64;
+ getres = left_shift ? Iop_64HIto32 : Iop_64to32;
+ shift = left_shift ? Iop_Shl64 : Iop_Shr64;
+ mask = mkU8(31);
+ } else {
+ /* sz == 2 */
+ tmpL = newTemp(Ity_I32);
+ tmpRes = newTemp(Ity_I16);
+ tmpSubSh = newTemp(Ity_I16);
+ mkpair = Iop_16HLto32;
+ getres = left_shift ? Iop_32HIto16 : Iop_32to16;
+ shift = left_shift ? Iop_Shl32 : Iop_Shr32;
+ mask = mkU8(15);
+ }
+
+ /* Do the shift, calculate the subshift value, and set
+ the flag thunk. */
+
+ assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
+
+ if (left_shift)
+ assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
+ else
+ assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
+
+ assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
+ assign( tmpSubSh,
+ unop(getres,
+ binop(shift,
+ mkexpr(tmpL),
+ binop(Iop_And8,
+ binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
+ mask))) );
+
+ setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
+ tmpRes, tmpSubSh, ty, tmpSH );
+
+ /* Put result back. */
+
+ if (epartIsReg(modrm)) {
+ putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
+ } else {
+ storeLE( mkexpr(addr), mkexpr(tmpRes) );
+ }
+
+ if (amt_is_literal) delta++;
+ return delta;
+}
+
+
+/* Handle BT/BTS/BTR/BTC Gv, Ev. Apparently b-size is not
+ required. */
+
+typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
+
+static HChar* nameBtOp ( BtOp op )
+{
+ switch (op) {
+ case BtOpNone: return "";
+ case BtOpSet: return "s";
+ case BtOpReset: return "r";
+ case BtOpComp: return "c";
+ default: vpanic("nameBtOp(x86)");
+ }
+}
+
+
+static
+UInt dis_bt_G_E ( VexAbiInfo* vbi,
+ UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
+{
+ HChar dis_buf[50];
+ UChar modrm;
+ Int len;
+ IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
+ t_addr1, t_esp, t_mask, t_new;
+
+ vassert(sz == 2 || sz == 4);
+
+ t_fetched = t_bitno0 = t_bitno1 = t_bitno2
+ = t_addr0 = t_addr1 = t_esp
+ = t_mask = t_new = IRTemp_INVALID;
+
+ t_fetched = newTemp(Ity_I8);
+ t_new = newTemp(Ity_I8);
+ t_bitno0 = newTemp(Ity_I32);
+ t_bitno1 = newTemp(Ity_I32);
+ t_bitno2 = newTemp(Ity_I8);
+ t_addr1 = newTemp(Ity_I32);
+ modrm = getIByte(delta);
+
+ assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
+
+ if (epartIsReg(modrm)) {
+ delta++;
+ /* Get it onto the client's stack. */
+ t_esp = newTemp(Ity_I32);
+ t_addr0 = newTemp(Ity_I32);
+
+ /* For the choice of the value 128, see comment in dis_bt_G_E in
+ guest_amd64_toIR.c. We point out here only that 128 is
+ fast-cased in Memcheck and is > 0, so seems like a good
+ choice. */
+ vassert(vbi->guest_stack_redzone_size == 0);
+ assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
+ putIReg(4, R_ESP, mkexpr(t_esp));
+
+ storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
+
+ /* Make t_addr0 point at it. */
+ assign( t_addr0, mkexpr(t_esp) );
+
+ /* Mask out upper bits of the shift amount, since we're doing a
+ reg. */
+ assign( t_bitno1, binop(Iop_And32,
+ mkexpr(t_bitno0),
+ mkU32(sz == 4 ? 31 : 15)) );
+
+ } else {
+ t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
+ delta += len;
+ assign( t_bitno1, mkexpr(t_bitno0) );
+ }
+
+ /* At this point: t_addr0 is the address being operated on. If it
+ was a reg, we will have pushed it onto the client's stack.
+ t_bitno1 is the bit number, suitably masked in the case of a
+ reg. */
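+ /* Worked example: btl %eax, (%ebx) with EAX == 35 gives
+ t_addr1 = EBX + (35 >>s 3) = EBX+4 and t_bitno2 = 35 & 7 = 3,
+ so the carry flag is loaded from bit 3 of the byte at EBX+4.
+ The arithmetic shift (Sar32) matters because a register bit
+ index is signed and may address bytes below the base. */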
+
+ /* Now the main sequence. */
+ assign( t_addr1,
+ binop(Iop_Add32,
+ mkexpr(t_addr0),
+ binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
+
+ /* t_addr1 now holds effective address */
+
+ assign( t_bitno2,
+ unop(Iop_32to8,
+ binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
+
+ /* t_bitno2 contains offset of bit within byte */
+
+ if (op != BtOpNone) {
+ t_mask = newTemp(Ity_I8);
+ assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
+ }
+
+ /* t_mask is now a suitable byte mask */
+
+ assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
+
+ if (op != BtOpNone) {
+ switch (op) {
+ case BtOpSet:
+ assign( t_new,
+ binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
+ break;
+ case BtOpComp:
+ assign( t_new,
+ binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
+ break;
+ case BtOpReset:
+ assign( t_new,
+ binop(Iop_And8, mkexpr(t_fetched),
+ unop(Iop_Not8, mkexpr(t_mask))) );
+ break;
+ default:
+ vpanic("dis_bt_G_E(x86)");
+ }
+ if (locked && !epartIsReg(modrm)) {
+ casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
+ mkexpr(t_new)/*new*/,
+ guest_EIP_curr_instr );
+ } else {
+ storeLE( mkexpr(t_addr1), mkexpr(t_new) );
+ }
+ }
+
+ /* Side effect done; now get selected bit into Carry flag */
+ /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop(Iop_And32,
+ binop(Iop_Shr32,
+ unop(Iop_8Uto32, mkexpr(t_fetched)),
+ mkexpr(t_bitno2)),
+ mkU32(1)))
+ );
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+
+ /* Move reg operand from stack back to reg */
+ if (epartIsReg(modrm)) {
+ /* t_esp still points at it. */
+ putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
+ putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
+ }
+
+ DIP("bt%s%c %s, %s\n",
+ nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
+ ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
+
+ return delta;
+}
+
+
+
+/* Handle BSF/BSR. Only v-size seems necessary. */
+static
+UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
+{
+ Bool isReg;
+ UChar modrm;
+ HChar dis_buf[50];
+
+ IRType ty = szToITy(sz);
+ IRTemp src = newTemp(ty);
+ IRTemp dst = newTemp(ty);
+
+ IRTemp src32 = newTemp(Ity_I32);
+ IRTemp dst32 = newTemp(Ity_I32);
+ IRTemp src8 = newTemp(Ity_I8);
+
+ vassert(sz == 4 || sz == 2);
+
+ modrm = getIByte(delta);
+
+ isReg = epartIsReg(modrm);
+ if (isReg) {
+ delta++;
+ assign( src, getIReg(sz, eregOfRM(modrm)) );
+ } else {
+ Int len;
+ IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
+ delta += len;
+ assign( src, loadLE(ty, mkexpr(addr)) );
+ }
+
+ DIP("bs%c%c %s, %s\n",
+ fwds ? 'f' : 'r', nameISize(sz),
+ ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
+ nameIReg(sz, gregOfRM(modrm)));
+
+ /* Generate an 8-bit expression which is zero iff the
+ original is zero, and nonzero otherwise */
+ assign( src8,
+ unop(Iop_1Uto8, binop(mkSizedOp(ty,Iop_CmpNE8),
+ mkexpr(src), mkU(ty,0))) );
+
+ /* Flags: Z is 1 iff source value is zero. All others
+ are undefined -- we force them to zero. */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ IRExpr_Mux0X( mkexpr(src8),
+ /* src==0 */
+ mkU32(X86G_CC_MASK_Z),
+ /* src!=0 */
+ mkU32(0)
+ )
+ ));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+
+ /* Result: iff source value is zero, we can't use
+ Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
+ But anyway, Intel x86 semantics say the result is undefined in
+ such situations. Hence handle the zero case specially. */
+
+ /* Bleh. What we compute:
+
+ bsf32: if src == 0 then 0 else Ctz32(src)
+ bsr32: if src == 0 then 0 else 31 - Clz32(src)
+
+ bsf16: if src == 0 then 0 else Ctz32(16Uto32(src))
+ bsr16: if src == 0 then 0 else 31 - Clz32(16Uto32(src))
+
+ First, widen src to 32 bits if it is not already.
+
+ Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
+ dst register unchanged when src == 0. Hence change accordingly.
+ */
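+ /* Worked example: for src32 == 0x28 (binary 101000), bsf
+ computes Ctz32(0x28) = 3 (index of the lowest set bit) and
+ bsr computes 31 - Clz32(0x28) = 31 - 26 = 5 (index of the
+ highest set bit). */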
+ if (sz == 2)
+ assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
+ else
+ assign( src32, mkexpr(src) );
+
+ /* The main computation, guarding against zero. */
+ assign( dst32,
+ IRExpr_Mux0X(
+ mkexpr(src8),
+ /* src == 0 -- leave dst unchanged */
+ widenUto32( getIReg( sz, gregOfRM(modrm) ) ),
+ /* src != 0 */
+ fwds ? unop(Iop_Ctz32, mkexpr(src32))
+ : binop(Iop_Sub32,
+ mkU32(31),
+ unop(Iop_Clz32, mkexpr(src32)))
+ )
+ );
+
+ if (sz == 2)
+ assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
+ else
+ assign( dst, mkexpr(dst32) );
+
+ /* dump result back */
+ putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
+
+ return delta;
+}
+
+
+static
+void codegen_xchg_eAX_Reg ( Int sz, Int reg )
+{
+ IRType ty = szToITy(sz);
+ IRTemp t1 = newTemp(ty);
+ IRTemp t2 = newTemp(ty);
+ vassert(sz == 2 || sz == 4);
+ assign( t1, getIReg(sz, R_EAX) );
+ assign( t2, getIReg(sz, reg) );
+ putIReg( sz, R_EAX, mkexpr(t2) );
+ putIReg( sz, reg, mkexpr(t1) );
+ DIP("xchg%c %s, %s\n",
+ nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
+}
+
+
+static
+void codegen_SAHF ( void )
+{
+ /* Set the flags to:
+ (x86g_calculate_eflags_all() & X86G_CC_MASK_O)
+ -- retain the old O flag
+ | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
+ |X86G_CC_MASK_P|X86G_CC_MASK_C))
+ */
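+ /* In the EFLAGS image these five bits sit at S=0x80, Z=0x40,
+ A=0x10, P=0x04 and C=0x01, so mask_SZACP == 0xD5, matching
+ the SF:ZF:0:AF:0:PF:1:CF layout of %AH. */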
+ UInt mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
+ |X86G_CC_MASK_C|X86G_CC_MASK_P;
+ IRTemp oldflags = newTemp(Ity_I32);
+ assign( oldflags, mk_x86g_calculate_eflags_all() );
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ binop(Iop_Or32,
+ binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
+ binop(Iop_And32,
+ binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
+ mkU32(mask_SZACP))
+ )
+ ));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+}
+
+
+static
+void codegen_LAHF ( void )
+{
+ /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
+ IRExpr* eax_with_hole;
+ IRExpr* new_byte;
+ IRExpr* new_eax;
+ UInt mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
+ |X86G_CC_MASK_C|X86G_CC_MASK_P;
+
+ IRTemp flags = newTemp(Ity_I32);
+ assign( flags, mk_x86g_calculate_eflags_all() );
+
+ eax_with_hole
+ = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
+ new_byte
+ = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
+ mkU32(1<<1)); /* EFLAGS bit 1 always reads as 1 */
+ new_eax
+ = binop(Iop_Or32, eax_with_hole,
+ binop(Iop_Shl32, new_byte, mkU8(8)));
+ putIReg(4, R_EAX, new_eax);
+}
+
+
+static
+UInt dis_cmpxchg_G_E ( UChar sorb,
+ Bool locked,
+ Int size,
+ Int delta0 )
+{
+ HChar dis_buf[50];
+ Int len;
+
+ IRType ty = szToITy(size);
+ IRTemp acc = newTemp(ty);
+ IRTemp src = newTemp(ty);
+ IRTemp dest = newTemp(ty);
+ IRTemp dest2 = newTemp(ty);
+ IRTemp acc2 = newTemp(ty);
+ IRTemp cond8 = newTemp(Ity_I8);
+ IRTemp addr = IRTemp_INVALID;
+ UChar rm = getUChar(delta0);
+
+ /* There are 3 cases to consider:
+
+ reg-reg: ignore any lock prefix, generate sequence based
+ on Mux0X
+
+ reg-mem, not locked: ignore any lock prefix, generate sequence
+ based on Mux0X
+
+ reg-mem, locked: use IRCAS
+ */
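+ /* In all three cases the architected effect is: if %EAX equals
+ dest then dest := src and ZF := 1, else %EAX := dest and
+ ZF := 0. cond8 below is the computed ZF, so the Mux0X pairs
+ steer both the destination and the new %EAX; on success %EAX
+ is deliberately left unchanged. */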
+ if (epartIsReg(rm)) {
+ /* case 1 */
+ assign( dest, getIReg(size, eregOfRM(rm)) );
+ delta0++;
+ assign( src, getIReg(size, gregOfRM(rm)) );
+ assign( acc, getIReg(size, R_EAX) );
+ setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
+ assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
+ assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
+ assign( acc2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
+ putIReg(size, R_EAX, mkexpr(acc2));
+ putIReg(size, eregOfRM(rm), mkexpr(dest2));
+ DIP("cmpxchg%c %s,%s\n", nameISize(size),
+ nameIReg(size,gregOfRM(rm)),
+ nameIReg(size,eregOfRM(rm)) );
+ }
+ else if (!epartIsReg(rm) && !locked) {
+ /* case 2 */
+ addr = disAMode ( &len, sorb, delta0, dis_buf );
+ assign( dest, loadLE(ty, mkexpr(addr)) );
+ delta0 += len;
+ assign( src, getIReg(size, gregOfRM(rm)) );
+ assign( acc, getIReg(size, R_EAX) );
+ setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
+ assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
+ assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
+ assign( acc2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
+ putIReg(size, R_EAX, mkexpr(acc2));
+ storeLE( mkexpr(addr), mkexpr(dest2) );
+ DIP("cmpxchg%c %s,%s\n", nameISize(size),
+ nameIReg(size,gregOfRM(rm)), dis_buf);
+ }
+ else if (!epartIsReg(rm) && locked) {
+ /* case 3 */
+ /* src is new value. acc is expected value. dest is old value.
+ Compute success from the output of the IRCAS, and steer the
+ new value for EAX accordingly: in case of success, EAX is
+ unchanged. */
+ addr = disAMode ( &len, sorb, delta0, dis_buf );
+ delta0 += len;
+ assign( src, getIReg(size, gregOfRM(rm)) );
+ assign( acc, getIReg(size, R_EAX) );
+ stmt( IRStmt_CAS(
+ mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
+ NULL, mkexpr(acc), NULL, mkexpr(src) )
+ ));
+ setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
+ assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
+ assign( acc2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
+ putIReg(size, R_EAX, mkexpr(acc2));
+ DIP("cmpxchg%c %s,%s\n", nameISize(size),
+ nameIReg(size,gregOfRM(rm)), dis_buf);
+ }
+ else vassert(0);
+
+ return delta0;
+}
+
+
+/* Handle conditional move instructions of the form
+ cmovcc E(reg-or-mem), G(reg)
+
+ E(src) is reg-or-mem
+ G(dst) is reg.
+
+ If E is reg, --> GET %E, tmps
+ GET %G, tmpd
+ CMOVcc tmps, tmpd
+ PUT tmpd, %G
+
+ If E is mem --> (getAddr E) -> tmpa
+ LD (tmpa), tmps
+ GET %G, tmpd
+ CMOVcc tmps, tmpd
+ PUT tmpd, %G
+*/
+static
+UInt dis_cmov_E_G ( UChar sorb,
+ Int sz,
+ X86Condcode cond,
+ Int delta0 )
+{
+ UChar rm = getIByte(delta0);
+ HChar dis_buf[50];
+ Int len;
+
+ IRType ty = szToITy(sz);
+ IRTemp tmps = newTemp(ty);
+ IRTemp tmpd = newTemp(ty);
+
+ if (epartIsReg(rm)) {
+ assign( tmps, getIReg(sz, eregOfRM(rm)) );
+ assign( tmpd, getIReg(sz, gregOfRM(rm)) );
+
+ putIReg(sz, gregOfRM(rm),
+ IRExpr_Mux0X( unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(cond)),
+ mkexpr(tmpd),
+ mkexpr(tmps) )
+ );
+ DIP("cmov%c%s %s,%s\n", nameISize(sz),
+ name_X86Condcode(cond),
+ nameIReg(sz,eregOfRM(rm)),
+ nameIReg(sz,gregOfRM(rm)));
+ return 1+delta0;
+ }
+
+ /* E refers to memory */
+ {
+ IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
+ assign( tmps, loadLE(ty, mkexpr(addr)) );
+ assign( tmpd, getIReg(sz, gregOfRM(rm)) );
+
+ putIReg(sz, gregOfRM(rm),
+ IRExpr_Mux0X( unop(Iop_1Uto8,
+ mk_x86g_calculate_condition(cond)),
+ mkexpr(tmpd),
+ mkexpr(tmps) )
+ );
+
+ DIP("cmov%c%s %s,%s\n", nameISize(sz),
+ name_X86Condcode(cond),
+ dis_buf,
+ nameIReg(sz,gregOfRM(rm)));
+ return len+delta0;
+ }
+}
+
+
+static
+UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
+ Bool* decodeOK )
+{
+ Int len;
+ UChar rm = getIByte(delta0);
+ HChar dis_buf[50];
+
+ IRType ty = szToITy(sz);
+ IRTemp tmpd = newTemp(ty);
+ IRTemp tmpt0 = newTemp(ty);
+ IRTemp tmpt1 = newTemp(ty);
+
+ /* There are 3 cases to consider:
+
+ reg-reg: ignore any lock prefix,
+ generate 'naive' (non-atomic) sequence
+
+ reg-mem, not locked: ignore any lock prefix, generate 'naive'
+ (non-atomic) sequence
+
+ reg-mem, locked: use IRCAS
+ */
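+ /* In all three cases the architected effect is:
+ tmp := E + G; G := old E; E := tmp, with flags set from
+ the addition. */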
+
+ if (epartIsReg(rm)) {
+ /* case 1 */
+ assign( tmpd, getIReg(sz, eregOfRM(rm)));
+ assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
+ assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(tmpd), mkexpr(tmpt0)) );
+ setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
+ putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
+ putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
+ DIP("xadd%c %s, %s\n",
+ nameISize(sz), nameIReg(sz,gregOfRM(rm)),
+ nameIReg(sz,eregOfRM(rm)));
+ *decodeOK = True;
+ return 1+delta0;
+ }
+ else if (!epartIsReg(rm) && !locked) {
+ /* case 2 */
+ IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
+ assign( tmpd, loadLE(ty, mkexpr(addr)) );
+ assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
+ assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(tmpd), mkexpr(tmpt0)) );
+ storeLE( mkexpr(addr), mkexpr(tmpt1) );
+ setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
+ putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
+ DIP("xadd%c %s, %s\n",
+ nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
+ *decodeOK = True;
+ return len+delta0;
+ }
+ else if (!epartIsReg(rm) && locked) {
+ /* case 3 */
+ IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
+ assign( tmpd, loadLE(ty, mkexpr(addr)) );
+ assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
+ assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
+ mkexpr(tmpd), mkexpr(tmpt0)) );
+ casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
+ mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
+ setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
+ putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
+ DIP("xadd%c %s, %s\n",
+ nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
+ *decodeOK = True;
+ return len+delta0;
+ }
+ /*NOTREACHED*/
+ vassert(0);
+}
+
+/* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
+
+static
+UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
+{
+ Int len;
+ IRTemp addr;
+ UChar rm = getIByte(delta0);
+ HChar dis_buf[50];
+
+ if (epartIsReg(rm)) {
+ putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
+ DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
+ return 1+delta0;
+ } else {
+ addr = disAMode ( &len, sorb, delta0, dis_buf );
+ putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
+ DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
+ return len+delta0;
+ }
+}
+
+/* Move 16 bits from G (a segment register) to Ew (ireg or mem). If
+ dst is ireg and sz==4, zero out top half of it. */
+
+static
+UInt dis_mov_Sw_Ew ( UChar sorb,
+ Int sz,
+ Int delta0 )
+{
+ Int len;
+ IRTemp addr;
+ UChar rm = getIByte(delta0);
+ HChar dis_buf[50];
+
+ vassert(sz == 2 || sz == 4);
+
+ if (epartIsReg(rm)) {
+ if (sz == 4)
+ putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
+ else
+ putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
+
+ DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
+ return 1+delta0;
+ } else {
+ addr = disAMode ( &len, sorb, delta0, dis_buf );
+ storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
+ DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
+ return len+delta0;
+ }
+}
+
+
+static
+void dis_push_segreg ( UInt sreg, Int sz )
+{
+ IRTemp t1 = newTemp(Ity_I16);
+ IRTemp ta = newTemp(Ity_I32);
+ vassert(sz == 2 || sz == 4);
+
+ assign( t1, getSReg(sreg) );
+ assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
+ putIReg(4, R_ESP, mkexpr(ta));
+ storeLE( mkexpr(ta), mkexpr(t1) );
+
+ DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
+}
+
+static
+void dis_pop_segreg ( UInt sreg, Int sz )
+{
+ IRTemp t1 = newTemp(Ity_I16);
+ IRTemp ta = newTemp(Ity_I32);
+ vassert(sz == 2 || sz == 4);
+
+ assign( ta, getIReg(4, R_ESP) );
+ assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
+
+ putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
+ putSReg( sreg, mkexpr(t1) );
+ DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
+}
+
+static
+void dis_ret ( UInt d32 )
+{
+ IRTemp t1 = newTemp(Ity_I32), t2 = newTemp(Ity_I32);
+ assign(t1, getIReg(4,R_ESP));
+ assign(t2, loadLE(Ity_I32,mkexpr(t1)));
+ putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
+ jmp_treg(Ijk_Ret,t2);
+}
+
+/*------------------------------------------------------------*/
+/*--- SSE/SSE2/SSE3 helpers ---*/
+/*------------------------------------------------------------*/
+
+/* Worker function; do not call directly.
+ Handles full width G = G `op` E and G = (not G) `op` E.
+*/
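+/* invertG == True computes G = (not G) `op` E, which is the
+ shape andnps/andnpd need (dst := NOT(dst) AND src); see
+ dis_SSE_E_to_G_all_invG below. */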
+
+static UInt dis_SSE_E_to_G_all_wrk (
+ UChar sorb, Int delta,
+ HChar* opname, IROp op,
+ Bool invertG
+ )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getIByte(delta);
+ IRExpr* gpart
+ = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
+ : getXMMReg(gregOfRM(rm));
+ if (epartIsReg(rm)) {
+ putXMMReg( gregOfRM(rm),
+ binop(op, gpart,
+ getXMMReg(eregOfRM(rm))) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRM(rm)),
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ putXMMReg( gregOfRM(rm),
+ binop(op, gpart,
+ loadLE(Ity_V128, mkexpr(addr))) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* All lanes SSE binary operation, G = G `op` E. */
+
+static
+UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, HChar* opname, IROp op )
+{
+ return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
+}
+
+/* All lanes SSE binary operation, G = (not G) `op` E. */
+
+static
+UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
+ HChar* opname, IROp op )
+{
+ return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
+}
+
+
+/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
+
+static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
+ HChar* opname, IROp op )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getIByte(delta);
+ IRExpr* gpart = getXMMReg(gregOfRM(rm));
+ if (epartIsReg(rm)) {
+ putXMMReg( gregOfRM(rm),
+ binop(op, gpart,
+ getXMMReg(eregOfRM(rm))) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRM(rm)),
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+1;
+ } else {
+ /* We can only do a 32-bit memory read, so the upper 3/4 of the
+ E operand needs to be made simply of zeroes. */
+ IRTemp epart = newTemp(Ity_V128);
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ assign( epart, unop( Iop_32UtoV128,
+ loadLE(Ity_I32, mkexpr(addr))) );
+ putXMMReg( gregOfRM(rm),
+ binop(op, gpart, mkexpr(epart)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
+
+static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
+ HChar* opname, IROp op )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getIByte(delta);
+ IRExpr* gpart = getXMMReg(gregOfRM(rm));
+ if (epartIsReg(rm)) {
+ putXMMReg( gregOfRM(rm),
+ binop(op, gpart,
+ getXMMReg(eregOfRM(rm))) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRM(rm)),
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+1;
+ } else {
+ /* We can only do a 64-bit memory read, so the upper half of the
+ E operand needs to be made simply of zeroes. */
+ IRTemp epart = newTemp(Ity_V128);
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ assign( epart, unop( Iop_64UtoV128,
+ loadLE(Ity_I64, mkexpr(addr))) );
+ putXMMReg( gregOfRM(rm),
+ binop(op, gpart, mkexpr(epart)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* All lanes unary SSE operation, G = op(E). */
+
+static UInt dis_SSE_E_to_G_unary_all (
+ UChar sorb, Int delta,
+ HChar* opname, IROp op
+ )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getIByte(delta);
+ if (epartIsReg(rm)) {
+ putXMMReg( gregOfRM(rm),
+ unop(op, getXMMReg(eregOfRM(rm))) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRM(rm)),
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ putXMMReg( gregOfRM(rm),
+ unop(op, loadLE(Ity_V128, mkexpr(addr))) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
+
+static UInt dis_SSE_E_to_G_unary_lo32 (
+ UChar sorb, Int delta,
+ HChar* opname, IROp op
+ )
+{
+ /* First we need to get the old G value and patch the low 32 bits
+ of the E operand into it. Then apply op and write back to G. */
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getIByte(delta);
+ IRTemp oldG0 = newTemp(Ity_V128);
+ IRTemp oldG1 = newTemp(Ity_V128);
+
+ assign( oldG0, getXMMReg(gregOfRM(rm)) );
+
+ if (epartIsReg(rm)) {
+ assign( oldG1,
+ binop( Iop_SetV128lo32,
+ mkexpr(oldG0),
+ getXMMRegLane32(eregOfRM(rm), 0)) );
+ putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRM(rm)),
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ assign( oldG1,
+ binop( Iop_SetV128lo32,
+ mkexpr(oldG0),
+ loadLE(Ity_I32, mkexpr(addr)) ));
+ putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
+
+static UInt dis_SSE_E_to_G_unary_lo64 (
+ UChar sorb, Int delta,
+ HChar* opname, IROp op
+ )
+{
+ /* First we need to get the old G value and patch the low 64 bits
+ of the E operand into it. Then apply op and write back to G. */
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getIByte(delta);
+ IRTemp oldG0 = newTemp(Ity_V128);
+ IRTemp oldG1 = newTemp(Ity_V128);
+
+ assign( oldG0, getXMMReg(gregOfRM(rm)) );
+
+ if (epartIsReg(rm)) {
+ assign( oldG1,
+ binop( Iop_SetV128lo64,
+ mkexpr(oldG0),
+ getXMMRegLane64(eregOfRM(rm), 0)) );
+ putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRM(rm)),
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ assign( oldG1,
+ binop( Iop_SetV128lo64,
+ mkexpr(oldG0),
+ loadLE(Ity_I64, mkexpr(addr)) ));
+ putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRM(rm)) );
+ return delta+alen;
+ }
+}
+
+
+/* SSE integer binary operation:
+ G = G `op` E (eLeft == False)
+ G = E `op` G (eLeft == True)
+*/
+static UInt dis_SSEint_E_to_G(
+ UChar sorb, Int delta,
+ HChar* opname, IROp op,
+ Bool eLeft
+ )
+{
+ HChar dis_buf[50];
+ Int alen;
+ IRTemp addr;
+ UChar rm = getIByte(delta);
+ IRExpr* gpart = getXMMReg(gregOfRM(rm));
+ IRExpr* epart = NULL;
+ if (epartIsReg(rm)) {
+ epart = getXMMReg(eregOfRM(rm));
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRM(rm)),
+ nameXMMReg(gregOfRM(rm)) );
+ delta += 1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ epart = loadLE(Ity_V128, mkexpr(addr));
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRM(rm)) );
+ delta += alen;
+ }
+ putXMMReg( gregOfRM(rm),
+ eLeft ? binop(op, epart, gpart)
+ : binop(op, gpart, epart) );
+ return delta;
+}
+
+
+/* Helper for doing SSE FP comparisons. */
+
+static void findSSECmpOp ( Bool* needNot, IROp* op,
+ Int imm8, Bool all_lanes, Int sz )
+{
+ imm8 &= 7;
+ *needNot = False;
+ *op = Iop_INVALID;
+ if (imm8 >= 4) {
+ *needNot = True;
+ imm8 -= 4;
+ }
+
+ if (sz == 4 && all_lanes) {
+ switch (imm8) {
+ case 0: *op = Iop_CmpEQ32Fx4; return;
+ case 1: *op = Iop_CmpLT32Fx4; return;
+ case 2: *op = Iop_CmpLE32Fx4; return;
+ case 3: *op = Iop_CmpUN32Fx4; return;
+ default: break;
+ }
+ }
+ if (sz == 4 && !all_lanes) {
+ switch (imm8) {
+ case 0: *op = Iop_CmpEQ32F0x4; return;
+ case 1: *op = Iop_CmpLT32F0x4; return;
+ case 2: *op = Iop_CmpLE32F0x4; return;
+ case 3: *op = Iop_CmpUN32F0x4; return;
+ default: break;
+ }
+ }
+ if (sz == 8 && all_lanes) {
+ switch (imm8) {
+ case 0: *op = Iop_CmpEQ64Fx2; return;
+ case 1: *op = Iop_CmpLT64Fx2; return;
+ case 2: *op = Iop_CmpLE64Fx2; return;
+ case 3: *op = Iop_CmpUN64Fx2; return;
+ default: break;
+ }
+ }
+ if (sz == 8 && !all_lanes) {
+ switch (imm8) {
+ case 0: *op = Iop_CmpEQ64F0x2; return;
+ case 1: *op = Iop_CmpLT64F0x2; return;
+ case 2: *op = Iop_CmpLE64F0x2; return;
+ case 3: *op = Iop_CmpUN64F0x2; return;
+ default: break;
+ }
+ }
+ vpanic("findSSECmpOp(x86,guest)");
+}
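+
+/* Worked example of the encoding above: CMPPS with imm8 == 5
+   ("not less-than") has imm8 >= 4, so needNot is set and imm8
+   drops to 1, selecting Iop_CmpLT32Fx4; the caller then inverts
+   the comparison result, giving NLT as the complement of LT. */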
+
+/* Handles SSE 32F/64F comparisons. */
+
+static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
+ HChar* opname, Bool all_lanes, Int sz )
+{
+ HChar dis_buf[50];
+ Int alen, imm8;
+ IRTemp addr;
+ Bool needNot = False;
+ IROp op = Iop_INVALID;
+ IRTemp plain = newTemp(Ity_V128);
+ UChar rm = getIByte(delta);
+ UShort mask = 0;
+ vassert(sz == 4 || sz == 8);
+ if (epartIsReg(rm)) {
+ imm8 = getIByte(delta+1);
+ findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
+ assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
+ getXMMReg(eregOfRM(rm))) );
+ delta += 2;
+ DIP("%s $%d,%s,%s\n", opname,
+ (Int)imm8,
+ nameXMMReg(eregOfRM(rm)),
+ nameXMMReg(gregOfRM(rm)) );
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ imm8 = getIByte(delta+alen);
+ findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
+ assign( plain,
+ binop(
+ op,
+ getXMMReg(gregOfRM(rm)),
+ all_lanes ? loadLE(Ity_V128, mkexpr(addr))
+ : sz == 8 ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
+ : /*sz==4*/ unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
+ )
+ );
+ delta += alen+1;
+ DIP("%s $%d,%s,%s\n", opname,
+ (Int)imm8,
+ dis_buf,
+ nameXMMReg(gregOfRM(rm)) );
+ }
+
+ if (needNot && all_lanes) {
+ putXMMReg( gregOfRM(rm),
+ unop(Iop_NotV128, mkexpr(plain)) );
+ }
+ else
+ if (needNot && !all_lanes) {
+ mask = toUShort( sz==4 ? 0x000F : 0x00FF );
+ putXMMReg( gregOfRM(rm),
+ binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
+ }
+ else {
+ putXMMReg( gregOfRM(rm), mkexpr(plain) );
+ }
+
+ return delta;
+}
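+
+/* Note on the lo-lane masks above: mkV128 expands each of its 16
+   mask bits to one byte of the vector, so 0x000F denotes a vector
+   whose low 32 bits are all ones and 0x00FF one whose low 64 bits
+   are all ones -- exactly the lanes whose comparison result needs
+   inverting in the non-all_lanes case. */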
+
+
+/* Vector by scalar shift of G by the amount specified at the bottom
+ of E. */
+
+static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
+ HChar* opname, IROp op )
+{
+ HChar dis_buf[50];
+ Int alen, size;
+ IRTemp addr;
+ Bool shl, shr, sar;
+ UChar rm = getIByte(delta);
+ IRTemp g0 = newTemp(Ity_V128);
+ IRTemp g1 = newTemp(Ity_V128);
+ IRTemp amt = newTemp(Ity_I32);
+ IRTemp amt8 = newTemp(Ity_I8);
+ if (epartIsReg(rm)) {
+ assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
+ DIP("%s %s,%s\n", opname,
+ nameXMMReg(eregOfRM(rm)),
+ nameXMMReg(gregOfRM(rm)) );
+ delta++;
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
+ DIP("%s %s,%s\n", opname,
+ dis_buf,
+ nameXMMReg(gregOfRM(rm)) );
+ delta += alen;
+ }
+ assign( g0, getXMMReg(gregOfRM(rm)) );
+ assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
+
+ shl = shr = sar = False;
+ size = 0;
+ switch (op) {
+ case Iop_ShlN16x8: shl = True; size = 16; break;
+ case Iop_ShlN32x4: shl = True; size = 32; break;
+ case Iop_ShlN64x2: shl = True; size = 64; break;
+ case Iop_SarN16x8: sar = True; size = 16; break;
+ case Iop_SarN32x4: sar = True; size = 32; break;
+ case Iop_ShrN16x8: shr = True; size = 16; break;
+ case Iop_ShrN32x4: shr = True; size = 32; break;
+ case Iop_ShrN64x2: shr = True; size = 64; break;
+ default: vassert(0);
+ }
+
+ if (shl || shr) {
+ assign(
+ g1,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
+ mkV128(0x0000),
+ binop(op, mkexpr(g0), mkexpr(amt8))
+ )
+ );
+ } else
+ if (sar) {
+ assign(
+ g1,
+ IRExpr_Mux0X(
+ unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
+ binop(op, mkexpr(g0), mkU8(size-1)),
+ binop(op, mkexpr(g0), mkexpr(amt8))
+ )
+ );
+ } else {
+ /*NOTREACHED*/
+ vassert(0);
+ }
+
+ putXMMReg( gregOfRM(rm), mkexpr(g1) );
+ return delta;
+}
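+
+/* Behaviour sketch for out-of-range amounts: if amt >= the lane
+   size, logical shifts produce all zeroes, while arithmetic shifts
+   are clamped to size-1 so each lane fills with its sign bit --
+   matching the documented PSLL/PSRL/PSRA semantics. */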
+
+
+/* Vector by scalar shift of E by an immediate byte. */
+
+static
+UInt dis_SSE_shiftE_imm ( Int delta, HChar* opname, IROp op )
+{
+ Bool shl, shr, sar;
+ UChar rm = getIByte(delta);
+ IRTemp e0 = newTemp(Ity_V128);
+ IRTemp e1 = newTemp(Ity_V128);
+ UChar amt, size;
+ vassert(epartIsReg(rm));
+ vassert(gregOfRM(rm) == 2
+ || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
+ amt = getIByte(delta+1);
+ delta += 2;
+ DIP("%s $%d,%s\n", opname,
+ (Int)amt,
+ nameXMMReg(eregOfRM(rm)) );
+ assign( e0, getXMMReg(eregOfRM(rm)) );
+
+ shl = shr = sar = False;
+ size = 0;
+ switch (op) {
+ case Iop_ShlN16x8: shl = True; size = 16; break;
+ case Iop_ShlN32x4: shl = True; size = 32; break;
+ case Iop_ShlN64x2: shl = True; size = 64; break;
+ case Iop_SarN16x8: sar = True; size = 16; break;
+ case Iop_SarN32x4: sar = True; size = 32; break;
+ case Iop_ShrN16x8: shr = True; size = 16; break;
+ case Iop_ShrN32x4: shr = True; size = 32; break;
+ case Iop_ShrN64x2: shr = True; size = 64; break;
+ default: vassert(0);
+ }
+
+ if (shl || shr) {
+ assign( e1, amt >= size
+ ? mkV128(0x0000)
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else
+ if (sar) {
+ assign( e1, amt >= size
+ ? binop(op, mkexpr(e0), mkU8(size-1))
+ : binop(op, mkexpr(e0), mkU8(amt))
+ );
+ } else {
+ /*NOTREACHED*/
+ vassert(0);
+ }
+
+ putXMMReg( eregOfRM(rm), mkexpr(e1) );
+ return delta;
+}
+
+
+/* Get the current SSE rounding mode. */
+
+static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
+{
+ return binop( Iop_And32,
+ IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
+ mkU32(3) );
+}
+
+static void put_sse_roundingmode ( IRExpr* sseround )
+{
+ vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
+ stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
+}
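+
+/* Note (an assumption recorded here, not checked by this code): the
+   2-bit value kept at OFFB_SSEROUND uses the IR encoding
+   (Irrm_NEAREST=0, Irrm_NegINF=1, Irrm_PosINF=2, Irrm_ZERO=3),
+   which coincides with the x86 MXCSR.RC encoding, so the masked
+   value can be handed directly to Iop_F64toF32 and friends. */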
+
+/* Break a 128-bit value up into four 32-bit ints. */
+
+static void breakup128to32s ( IRTemp t128,
+ /*OUTs*/
+ IRTemp* t3, IRTemp* t2,
+ IRTemp* t1, IRTemp* t0 )
+{
+ IRTemp hi64 = newTemp(Ity_I64);
+ IRTemp lo64 = newTemp(Ity_I64);
+ assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
+ assign( lo64, unop(Iop_V128to64, mkexpr(t128)) );
+
+ vassert(t0 && *t0 == IRTemp_INVALID);
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+
+ *t0 = newTemp(Ity_I32);
+ *t1 = newTemp(Ity_I32);
+ *t2 = newTemp(Ity_I32);
+ *t3 = newTemp(Ity_I32);
+ assign( *t0, unop(Iop_64to32, mkexpr(lo64)) );
+ assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
+ assign( *t2, unop(Iop_64to32, mkexpr(hi64)) );
+ assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
+}
+
+/* Construct a 128-bit value from four 32-bit ints. */
+
+static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
+ IRTemp t1, IRTemp t0 )
+{
+ return
+ binop( Iop_64HLtoV128,
+ binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
+ binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
+ );
+}
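+
+/* Lane-ordering example: if t128 holds
+   0xDDDDDDDD_CCCCCCCC_BBBBBBBB_AAAAAAAA, breakup128to32s yields
+   t0 = 0xAAAAAAAA (least significant) through t3 = 0xDDDDDDDD, and
+   mk128from32s(t3,t2,t1,t0) reassembles the original value. */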
+
+/* Break a 64-bit value up into four 16-bit ints. */
+
+static void breakup64to16s ( IRTemp t64,
+ /*OUTs*/
+ IRTemp* t3, IRTemp* t2,
+ IRTemp* t1, IRTemp* t0 )
+{
+ IRTemp hi32 = newTemp(Ity_I32);
+ IRTemp lo32 = newTemp(Ity_I32);
+ assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
+ assign( lo32, unop(Iop_64to32, mkexpr(t64)) );
+
+ vassert(t0 && *t0 == IRTemp_INVALID);
+ vassert(t1 && *t1 == IRTemp_INVALID);
+ vassert(t2 && *t2 == IRTemp_INVALID);
+ vassert(t3 && *t3 == IRTemp_INVALID);
+
+ *t0 = newTemp(Ity_I16);
+ *t1 = newTemp(Ity_I16);
+ *t2 = newTemp(Ity_I16);
+ *t3 = newTemp(Ity_I16);
+ assign( *t0, unop(Iop_32to16, mkexpr(lo32)) );
+ assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
+ assign( *t2, unop(Iop_32to16, mkexpr(hi32)) );
+ assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
+}
+
+/* Construct a 64-bit value from four 16-bit ints. */
+
+static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
+ IRTemp t1, IRTemp t0 )
+{
+ return
+ binop( Iop_32HLto64,
+ binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
+ binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
+ );
+}
+
+/* Generate IR to set the guest %EFLAGS from the pushfl-format image
+ in the given 32-bit temporary. The flags that are set are: O S Z A
+ C P D ID AC.
+
+ In all cases, code to set AC is generated. However, VEX actually
+ ignores the AC value and so can optionally emit an emulation
+ warning when it is enabled. In this routine, an emulation warning
+ is only emitted if emit_AC_emwarn is True, in which case
+ next_insn_EIP must be correct (this allows for correct code
+ generation for popfl/popfw). If emit_AC_emwarn is False,
+ next_insn_EIP is unimportant (this allows for easy, if kludgey,
+ code generation for IRET). */
+
+static
+void set_EFLAGS_from_value ( IRTemp t1,
+ Bool emit_AC_emwarn,
+ Addr32 next_insn_EIP )
+{
+ vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
+
+ /* t1 is the flag word. Mask out everything except OSZACP and set
+ the flags thunk to X86G_CC_OP_COPY. */
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ binop(Iop_And32,
+ mkexpr(t1),
+ mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
+ | X86G_CC_MASK_A | X86G_CC_MASK_Z
+ | X86G_CC_MASK_S | X86G_CC_MASK_O )
+ )
+ )
+ );
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+
+ /* Also need to set the D flag, which is held in bit 10 of t1.
+ If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
+ stmt( IRStmt_Put(
+ OFFB_DFLAG,
+ IRExpr_Mux0X(
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
+ mkU32(1))),
+ mkU32(1),
+ mkU32(0xFFFFFFFF)))
+ );
+
+ /* Set the ID flag */
+ stmt( IRStmt_Put(
+ OFFB_IDFLAG,
+ IRExpr_Mux0X(
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
+ mkU32(1))),
+ mkU32(0),
+ mkU32(1)))
+ );
+
+ /* And set the AC flag. If setting it to 1, possibly emit an
+ emulation warning. */
+ stmt( IRStmt_Put(
+ OFFB_ACFLAG,
+ IRExpr_Mux0X(
+ unop(Iop_32to8,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
+ mkU32(1))),
+ mkU32(0),
+ mkU32(1)))
+ );
+
+ if (emit_AC_emwarn) {
+ put_emwarn( mkU32(EmWarn_X86_acFlag) );
+ stmt(
+ IRStmt_Exit(
+ binop( Iop_CmpNE32,
+ binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
+ mkU32(0) ),
+ Ijk_EmWarn,
+ IRConst_U32( next_insn_EIP )
+ )
+ );
+ }
+}
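+
+/* For reference, the EFLAGS bit positions assumed by the shifts
+   above: CF=0, PF=2, AF=4, ZF=6, SF=7, DF=10, AC=18, ID=21. The
+   OSZACP group travels through the flags thunk via OP_COPY; D, ID
+   and AC live in separate guest state fields. */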
+
+
+/* Helper for the SSSE3 (not SSE3) PMULHRSW insns. Given two 64-bit
+ values (aa,bb), computes, for each of the 4 16-bit lanes:
+
+ (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
+*/
+static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
+{
+ IRTemp aa = newTemp(Ity_I64);
+ IRTemp bb = newTemp(Ity_I64);
+ IRTemp aahi32s = newTemp(Ity_I64);
+ IRTemp aalo32s = newTemp(Ity_I64);
+ IRTemp bbhi32s = newTemp(Ity_I64);
+ IRTemp bblo32s = newTemp(Ity_I64);
+ IRTemp rHi = newTemp(Ity_I64);
+ IRTemp rLo = newTemp(Ity_I64);
+ IRTemp one32x2 = newTemp(Ity_I64);
+ assign(aa, aax);
+ assign(bb, bbx);
+ assign( aahi32s,
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
+ mkU8(16) ));
+ assign( aalo32s,
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
+ mkU8(16) ));
+ assign( bbhi32s,
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
+ mkU8(16) ));
+ assign( bblo32s,
+ binop(Iop_SarN32x2,
+ binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
+ mkU8(16) ));
+ assign(one32x2, mkU64( (1ULL << 32) + 1 ));
+ assign(
+ rHi,
+ binop(
+ Iop_ShrN32x2,
+ binop(
+ Iop_Add32x2,
+ binop(
+ Iop_ShrN32x2,
+ binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
+ mkU8(14)
+ ),
+ mkexpr(one32x2)
+ ),
+ mkU8(1)
+ )
+ );
+ assign(
+ rLo,
+ binop(
+ Iop_ShrN32x2,
+ binop(
+ Iop_Add32x2,
+ binop(
+ Iop_ShrN32x2,
+ binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
+ mkU8(14)
+ ),
+ mkexpr(one32x2)
+ ),
+ mkU8(1)
+ )
+ );
+ return
+ binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
+}
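+
+/* Worked example in Q15: with aa_lane = bb_lane = 0x4000 (+0.5) the
+   signed product is 0x10000000; >>u 14 gives 0x4000, +1 gives
+   0x4001, and the final >>u 1 gives 0x2000 (+0.25), i.e. a
+   round-to-nearest of the product's top 16 bits. */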
+
+/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns. Given two 64-bit
+ values (aa,bb), computes, for each lane:
+
+ if aa_lane < 0 then - bb_lane
+ else if aa_lane > 0 then bb_lane
+ else 0
+*/
+static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
+{
+ IRTemp aa = newTemp(Ity_I64);
+ IRTemp bb = newTemp(Ity_I64);
+ IRTemp zero = newTemp(Ity_I64);
+ IRTemp bbNeg = newTemp(Ity_I64);
+ IRTemp negMask = newTemp(Ity_I64);
+ IRTemp posMask = newTemp(Ity_I64);
+ IROp opSub = Iop_INVALID;
+ IROp opCmpGTS = Iop_INVALID;
+
+ switch (laneszB) {
+ case 1: opSub = Iop_Sub8x8; opCmpGTS = Iop_CmpGT8Sx8; break;
+ case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
+ case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
+ default: vassert(0);
+ }
+
+ assign( aa, aax );
+ assign( bb, bbx );
+ assign( zero, mkU64(0) );
+ assign( bbNeg, binop(opSub, mkexpr(zero), mkexpr(bb)) );
+ assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
+ assign( posMask, binop(opCmpGTS, mkexpr(aa), mkexpr(zero)) );
+
+ return
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(bb), mkexpr(posMask)),
+ binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
+
+}
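+
+/* Example, 16-bit lanes: aa_lane = -3 sets negMask for that lane,
+   so the result is 0 - bb_lane; aa_lane = 0 sets neither mask and
+   the lane comes out as 0, as PSIGN requires. */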
+
+/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns. Given a 64-bit
+ value aa, computes, for each lane
+
+ if aa < 0 then -aa else aa
+
+ Note that the result is interpreted as unsigned, so that the
+ absolute value of the most negative signed input can be
+ represented.
+*/
+static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
+{
+ IRTemp aa = newTemp(Ity_I64);
+ IRTemp zero = newTemp(Ity_I64);
+ IRTemp aaNeg = newTemp(Ity_I64);
+ IRTemp negMask = newTemp(Ity_I64);
+ IRTemp posMask = newTemp(Ity_I64);
+ IROp opSub = Iop_INVALID;
+ IROp opSarN = Iop_INVALID;
+
+ switch (laneszB) {
+ case 1: opSub = Iop_Sub8x8; opSarN = Iop_SarN8x8; break;
+ case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
+ case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
+ default: vassert(0);
+ }
+
+ assign( aa, aax );
+ assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
+ assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
+ assign( zero, mkU64(0) );
+ assign( aaNeg, binop(opSub, mkexpr(zero), mkexpr(aa)) );
+ return
+ binop(Iop_Or64,
+ binop(Iop_And64, mkexpr(aa), mkexpr(posMask)),
+ binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
+}
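+
+/* Example, 8-bit lanes: aa_lane = 0x80 (-128) sets negMask and
+   aaNeg = 0x80, so the result lane is 0x80, i.e. 128 when read as
+   unsigned -- the case the note above is about. */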
+
+static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
+ IRTemp lo64, Int byteShift )
+{
+ vassert(byteShift >= 1 && byteShift <= 7);
+ return
+ binop(Iop_Or64,
+ binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
+ binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
+ );
+}
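+
+/* Sketch: with byteShift == 3 this computes
+   (hi64 << 40) | (lo64 >> 24), i.e. bytes 3..10 of the 16-byte
+   concatenation hi64:lo64, which is PALIGNR's sliding window. */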
+
+/* Generate a SIGSEGV followed by a restart of the current instruction
+ if effective_addr is not 16-aligned. This is required behaviour
+ for some SSE3 instructions and all 128-bit SSSE3 instructions.
+ This assumes that guest_RIP_curr_instr is set correctly! */
+static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
+{
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32,
+ binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
+ mkU32(0)),
+ Ijk_SigSEGV,
+ IRConst_U32(guest_EIP_curr_instr)
+ )
+ );
+}
+
+
+/* Helper for deciding whether a given insn (starting at the opcode
+ byte) may validly be used with a LOCK prefix. The following insns
+ may be used with LOCK when their destination operand is in memory.
+ AFAICS this is exactly the same for both 32-bit and 64-bit mode.
+
+ ADD 80 /0, 81 /0, 82 /0, 83 /0, 00, 01
+ OR 80 /1, 81 /1, 82 /x, 83 /1, 08, 09
+ ADC 80 /2, 81 /2, 82 /2, 83 /2, 10, 11
+ SBB 80 /3, 81 /3, 82 /x, 83 /3, 18, 19
+ AND 80 /4, 81 /4, 82 /x, 83 /4, 20, 21
+ SUB 80 /5, 81 /5, 82 /x, 83 /5, 28, 29
+ XOR 80 /6, 81 /6, 82 /x, 83 /6, 30, 31
+
+ DEC FE /1, FF /1
+ INC FE /0, FF /0
+
+ NEG F6 /3, F7 /3
+ NOT F6 /2, F7 /2
+
+ XCHG 86, 87
+
+ BTC 0F BB, 0F BA /7
+ BTR 0F B3, 0F BA /6
+ BTS 0F AB, 0F BA /5
+
+ CMPXCHG 0F B0, 0F B1
+ CMPXCHG8B 0F C7 /1
+
+ XADD 0F C0, 0F C1
+
+ ------------------------------
+
+ 80 /0 = addb $imm8, rm8
+ 81 /0 = addl $imm32, rm32 and addw $imm16, rm16
+ 82 /0 = addb $imm8, rm8
+ 83 /0 = addl $simm8, rm32 and addw $simm8, rm16
+
+ 00 = addb r8, rm8
+ 01 = addl r32, rm32 and addw r16, rm16
+
+ Same for ADD OR ADC SBB AND SUB XOR
+
+ FE /1 = dec rm8
+ FF /1 = dec rm32 and dec rm16
+
+ FE /0 = inc rm8
+ FF /0 = inc rm32 and inc rm16
+
+ F6 /3 = neg rm8
+ F7 /3 = neg rm32 and neg rm16
+
+ F6 /2 = not rm8
+ F7 /2 = not rm32 and not rm16
+
+ 0F BB = btcw r16, rm16 and btcl r32, rm32
+ 0F BA /7 = btcw $imm8, rm16 and btcl $imm8, rm32
+
+ Same for BTS, BTR
+*/
+static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
+{
+ switch (opc[0]) {
+ case 0x00: case 0x01: case 0x08: case 0x09:
+ case 0x10: case 0x11: case 0x18: case 0x19:
+ case 0x20: case 0x21: case 0x28: case 0x29:
+ case 0x30: case 0x31:
+ if (!epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0x80: case 0x81: case 0x82: case 0x83:
+ if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
+ && !epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0xFE: case 0xFF:
+ if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
+ && !epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0xF6: case 0xF7:
+ if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
+ && !epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0x86: case 0x87:
+ if (!epartIsReg(opc[1]))
+ return True;
+ break;
+
+ case 0x0F: {
+ switch (opc[1]) {
+ case 0xBB: case 0xB3: case 0xAB:
+ if (!epartIsReg(opc[2]))
+ return True;
+ break;
+ case 0xBA:
+ if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
+ && !epartIsReg(opc[2]))
+ return True;
+ break;
+ case 0xB0: case 0xB1:
+ if (!epartIsReg(opc[2]))
+ return True;
+ break;
+ case 0xC7:
+ if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
+ return True;
+ break;
+ case 0xC0: case 0xC1:
+ if (!epartIsReg(opc[2]))
+ return True;
+ break;
+ default:
+ break;
+ } /* switch (opc[1]) */
+ break;
+ }
+
+ default:
+ break;
+ } /* switch (opc[0]) */
+
+ return False;
+}
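+
+/* Example: "lock incl (%eax)" arrives as opc[0] == 0xFF with
+   gregOfRM(opc[1]) == 0 and a memory E-part, so it is accepted;
+   "lock incl %eax" has a register E-part and is rejected, matching
+   hardware, which #UDs a LOCK with a register destination. */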
+
+
+/*------------------------------------------------------------*/
+/*--- Disassemble a single instruction ---*/
+/*------------------------------------------------------------*/
+
+/* Disassemble a single instruction into IR. The instruction is
+ located in host memory at &guest_code[delta]. *expect_CAS is set
+ to True if the resulting IR is expected to contain an IRCAS
+ statement, and False if it's not expected to. This makes it
+ possible for the caller of disInstr_X86_WRK to check that
+ LOCK-prefixed instructions are at least plausibly translated, in
+ that it becomes possible to check that a (validly) LOCK-prefixed
+ instruction generates a translation containing an IRCAS, and
+ instructions without LOCK prefixes don't generate translations
+ containing an IRCAS.
+*/
+static
+DisResult disInstr_X86_WRK (
+ /*OUT*/Bool* expect_CAS,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ Long delta64,
+ VexArchInfo* archinfo,
+ VexAbiInfo* vbi
+ )
+{
+ IRType ty;
+ IRTemp addr, t0, t1, t2, t3, t4, t5, t6;
+ Int alen;
+ UChar opc, modrm, abyte, pre;
+ UInt d32;
+ HChar dis_buf[50];
+ Int am_sz, d_sz, n_prefixes;
+ DisResult dres;
+ UChar* insn; /* used in SSE decoders */
+
+ /* The running delta */
+ Int delta = (Int)delta64;
+
+ /* Holds eip at the start of the insn, so that we can print
+ consistent error messages for unimplemented insns. */
+ Int delta_start = delta;
+
+ /* sz denotes the nominal data-op size of the insn; we change it to
+ 2 if an 0x66 prefix is seen */
+ Int sz = 4;
+
+ /* sorb holds the segment-override-prefix byte, if any. Zero if no
+ prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
+ indicating the prefix. */
+ UChar sorb = 0;
+
+ /* Gets set to True if a LOCK prefix is seen. */
+ Bool pfx_lock = False;
+
+ /* Set result defaults. */
+ dres.whatNext = Dis_Continue;
+ dres.len = 0;
+ dres.continueAt = 0;
+
+ *expect_CAS = False;
+
+ addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
+
+ vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
+ DIP("\t0x%x: ", guest_EIP_bbstart+delta);
+
+ /* We may be asked to update the guest EIP before going further. */
+ if (put_IP)
+ stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr)) );
+
+ /* Spot "Special" instructions (see comment at top of file). */
+ {
+ UChar* code = (UChar*)(guest_code + delta);
+ /* Spot the 12-byte preamble:
+ C1C703 roll $3, %edi
+ C1C70D roll $13, %edi
+ C1C71D roll $29, %edi
+ C1C713 roll $19, %edi
+ */
+ if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
+ code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
+ code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
+ code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
+ /* Got a "Special" instruction preamble. Which one is it? */
+ if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
+ /* %EDX = client_request ( %EAX ) */
+ DIP("%%edx = client_request ( %%eax )\n");
+ delta += 14;
+ jmp_lit(Ijk_ClientReq, guest_EIP_bbstart+delta);
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ else
+ if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
+ /* %EAX = guest_NRADDR */
+ DIP("%%eax = guest_NRADDR\n");
+ delta += 14;
+ putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
+ goto decode_success;
+ }
+ else
+ if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
+ /* call-noredir *%EAX */
+ DIP("call-noredir *%%eax\n");
+ delta += 14;
+ t1 = newTemp(Ity_I32);
+ assign(t1, getIReg(4,R_EAX));
+ t2 = newTemp(Ity_I32);
+ assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
+ putIReg(4, R_ESP, mkexpr(t2));
+ storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
+ jmp_treg(Ijk_NoRedir,t1);
+ dres.whatNext = Dis_StopHere;
+ goto decode_success;
+ }
+ /* We don't know what it is. */
+ goto decode_failure;
+ /*NOTREACHED*/
+ }
+ }
+
+ /* Handle a couple of weird-ass NOPs that have been observed in the
+ wild. */
+ {
+ UChar* code = (UChar*)(guest_code + delta);
+ /* Sun's JVM 1.5.0 uses the following as a NOP:
+ 26 2E 64 65 90 %es:%cs:%fs:%gs:nop */
+ if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
+ && code[3] == 0x65 && code[4] == 0x90) {
+ DIP("%%es:%%cs:%%fs:%%gs:nop\n");
+ delta += 5;
+ goto decode_success;
+ }
+ /* Don't barf on recent binutils padding,
+ all variants of which are: nopw %cs:0x0(%eax,%eax,1)
+ 66 2e 0f 1f 84 00 00 00 00 00
+ 66 66 2e 0f 1f 84 00 00 00 00 00
+ 66 66 66 2e 0f 1f 84 00 00 00 00 00
+ 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
+ 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
+ 66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
+ */
+ if (code[0] == 0x66) {
+ Int data16_cnt;
+ for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
+ if (code[data16_cnt] != 0x66)
+ break;
+ if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
+ && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
+ && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
+ && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
+ && code[data16_cnt + 8] == 0x00 ) {
+ DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
+ delta += 9 + data16_cnt;
+ goto decode_success;
+ }
+ }
+ }
+
+ /* Normal instruction handling starts here. */
+
+ /* Deal with some but not all prefixes:
+ 66(oso)
+ F0(lock)
+ 2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
+ Not dealt with (left in place):
+ F2 F3
+ */
+ n_prefixes = 0;
+ while (True) {
+ if (n_prefixes > 7) goto decode_failure;
+ pre = getUChar(delta);
+ switch (pre) {
+ case 0x66:
+ sz = 2;
+ break;
+ case 0xF0:
+ pfx_lock = True;
+ *expect_CAS = True;
+ break;
+ case 0x3E: /* %DS: */
+ case 0x26: /* %ES: */
+ case 0x64: /* %FS: */
+ case 0x65: /* %GS: */
+ if (sorb != 0)
+ goto decode_failure; /* only one seg override allowed */
+ sorb = pre;
+ break;
+ case 0x2E: { /* %CS: */
+ /* 2E prefix on a conditional branch instruction is a
+ branch-prediction hint, which can safely be ignored. */
+ UChar op1 = getIByte(delta+1);
+ UChar op2 = getIByte(delta+2);
+ if ((op1 >= 0x70 && op1 <= 0x7F)
+ || (op1 == 0xE3)
+ || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
+ if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
+ } else {
+ /* All other CS override cases are not handled */
+ goto decode_failure;
+ }
+ break;
+ }
+ case 0x36: /* %SS: */
+ /* SS override cases are not handled */
+ goto decode_failure;
+ default:
+ goto not_a_prefix;
+ }
+ n_prefixes++;
+ delta++;
+ }
+
+ not_a_prefix:
+
+ /* Now we should be looking at the primary opcode byte or the
+ leading F2 or F3. Check that any LOCK prefix is actually
+ allowed. */
+
+ if (pfx_lock) {
+ if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
+ DIP("lock ");
+ } else {
+ *expect_CAS = False;
+ goto decode_failure;
+ }
+ }
+
+
+ /* ---------------------------------------------------- */
+ /* --- The SSE decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* What did I do to deserve SSE? Perhaps I was really bad in a
+ previous life? */
+
+ /* Note, this doesn't handle SSE2 or SSE3. That is handled in a
+ later section, further on. */
+
+ insn = (UChar*)&guest_code[delta];
+
+ /* Treat fxsave specially. It should be doable even on an SSE0
+ (Pentium-II class) CPU. Hence be prepared to handle it on
+ any subarchitecture variant.
+ */
+
+ /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
+ && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
+ IRDirty* d;
+ modrm = getIByte(delta+2);
+ vassert(sz == 4);
+ vassert(!epartIsReg(modrm));
+
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+
+ DIP("fxsave %s\n", dis_buf);
+
+ /* Uses dirty helper:
+ void x86g_dirtyhelper_FXSAVE ( VexGuestX86State*, UInt ) */
+ d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "x86g_dirtyhelper_FXSAVE",
+ &x86g_dirtyhelper_FXSAVE,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+
+ /* declare we're writing memory */
+ d->mFx = Ifx_Write;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 512;
+
+ /* declare we're reading guest state */
+ d->nFxState = 7;
+
+ d->fxState[0].fx = Ifx_Read;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Read;
+ d->fxState[1].offset = OFFB_FPREGS;
+ d->fxState[1].size = 8 * sizeof(ULong);
+
+ d->fxState[2].fx = Ifx_Read;
+ d->fxState[2].offset = OFFB_FPTAGS;
+ d->fxState[2].size = 8 * sizeof(UChar);
+
+ d->fxState[3].fx = Ifx_Read;
+ d->fxState[3].offset = OFFB_FPROUND;
+ d->fxState[3].size = sizeof(UInt);
+
+ d->fxState[4].fx = Ifx_Read;
+ d->fxState[4].offset = OFFB_FC3210;
+ d->fxState[4].size = sizeof(UInt);
+
+ d->fxState[5].fx = Ifx_Read;
+ d->fxState[5].offset = OFFB_XMM0;
+ d->fxState[5].size = 8 * sizeof(U128);
+
+ d->fxState[6].fx = Ifx_Read;
+ d->fxState[6].offset = OFFB_SSEROUND;
+ d->fxState[6].size = sizeof(UInt);
+
+ /* Be paranoid ... this assertion tries to ensure the 8 %xmm
+ images are packed back-to-back. If not, the value of
+ d->fxState[5].size is wrong. */
+ vassert(16 == sizeof(U128));
+ vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
+
+ stmt( IRStmt_Dirty(d) );
+
+ goto decode_success;
+ }
+
+ /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
+ && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
+ IRDirty* d;
+ modrm = getIByte(delta+2);
+ vassert(sz == 4);
+ vassert(!epartIsReg(modrm));
+
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+
+ DIP("fxrstor %s\n", dis_buf);
+
+ /* Uses dirty helper:
+ void x86g_dirtyhelper_FXRSTOR ( VexGuestX86State*, UInt ) */
+ d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "x86g_dirtyhelper_FXRSTOR",
+ &x86g_dirtyhelper_FXRSTOR,
+ mkIRExprVec_1( mkexpr(addr) )
+ );
+ d->needsBBP = True;
+
+ /* declare we're reading memory */
+ d->mFx = Ifx_Read;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 512;
+
+ /* declare we're writing guest state */
+ d->nFxState = 7;
+
+ d->fxState[0].fx = Ifx_Write;
+ d->fxState[0].offset = OFFB_FTOP;
+ d->fxState[0].size = sizeof(UInt);
+
+ d->fxState[1].fx = Ifx_Write;
+ d->fxState[1].offset = OFFB_FPREGS;
+ d->fxState[1].size = 8 * sizeof(ULong);
+
+ d->fxState[2].fx = Ifx_Write;
+ d->fxState[2].offset = OFFB_FPTAGS;
+ d->fxState[2].size = 8 * sizeof(UChar);
+
+ d->fxState[3].fx = Ifx_Write;
+ d->fxState[3].offset = OFFB_FPROUND;
+ d->fxState[3].size = sizeof(UInt);
+
+ d->fxState[4].fx = Ifx_Write;
+ d->fxState[4].offset = OFFB_FC3210;
+ d->fxState[4].size = sizeof(UInt);
+
+ d->fxState[5].fx = Ifx_Write;
+ d->fxState[5].offset = OFFB_XMM0;
+ d->fxState[5].size = 8 * sizeof(U128);
+
+ d->fxState[6].fx = Ifx_Write;
+ d->fxState[6].offset = OFFB_SSEROUND;
+ d->fxState[6].size = sizeof(UInt);
+
+ /* Be paranoid ... this assertion tries to ensure the 8 %xmm
+ images are packed back-to-back. If not, the value of
+ d->fxState[5].size is wrong. */
+ vassert(16 == sizeof(U128));
+ vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
+
+ stmt( IRStmt_Dirty(d) );
+
+ goto decode_success;
+ }
+
+ /* ------ SSE decoder main ------ */
+
+ /* Skip parts of the decoder which don't apply given the stated
+ guest subarchitecture. */
+ if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
+ goto after_sse_decoders;
+
+ /* Otherwise we must be doing sse1 or sse2, so we can at least try
+ for SSE1 here. */
+
+ /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 55 = ANDNPS -- G = (not G) and E */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
+ delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 0F 54 = ANDPS -- G = G and E */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
+ delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
+ goto decode_success;
+ }
+
+ /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
+ vassert(sz == 4);
+ delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
+ goto decode_success;
+ }
+
+ /* 0F 2F = COMISS -- 32F0x4 comparison G,E, and set ZCP */
+ /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
+ if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
+ IRTemp argL = newTemp(Ity_F32);
+ IRTemp argR = newTemp(Ity_F32);
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
+ delta += 2+1;
+ DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)) );
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("[u]comiss %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+ assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop( Iop_And32,
+ binop(Iop_CmpF64,
+ unop(Iop_F32toF64,mkexpr(argL)),
+ unop(Iop_F32toF64,mkexpr(argR))),
+ mkU32(0x45)
+ )));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+ goto decode_success;
+ }
+
+ /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
+ half xmm */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
+ IRTemp arg64 = newTemp(Ity_I64);
+ IRTemp rmode = newTemp(Ity_I32);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+2);
+ do_MMX_preamble();
+ if (epartIsReg(modrm)) {
+ assign( arg64, getMMXReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtpi2ps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+
+ putXMMRegLane32F(
+ gregOfRM(modrm), 0,
+ binop(Iop_F64toF32,
+ mkexpr(rmode),
+ unop(Iop_I32StoF64,
+ unop(Iop_64to32, mkexpr(arg64)) )) );
+
+ putXMMRegLane32F(
+ gregOfRM(modrm), 1,
+ binop(Iop_F64toF32,
+ mkexpr(rmode),
+ unop(Iop_I32StoF64,
+ unop(Iop_64HIto32, mkexpr(arg64)) )) );
+
+ goto decode_success;
+ }
+
+ /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
+ quarter xmm */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
+ IRTemp arg32 = newTemp(Ity_I32);
+ IRTemp rmode = newTemp(Ity_I32);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ assign( arg32, getIReg(4, eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("cvtsi2ss %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+
+ putXMMRegLane32F(
+ gregOfRM(modrm), 0,
+ binop(Iop_F64toF32,
+ mkexpr(rmode),
+ unop(Iop_I32StoF64, mkexpr(arg32)) ) );
+
+ goto decode_success;
+ }
+
+ /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
+ I32 in mmx, according to prevailing SSE rounding mode */
+ /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
+ I32 in mmx, rounding towards zero */
+ if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
+ IRTemp dst64 = newTemp(Ity_I64);
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f32lo = newTemp(Ity_F32);
+ IRTemp f32hi = newTemp(Ity_F32);
+ Bool r2zero = toBool(insn[1] == 0x2C);
+
+ do_MMX_preamble();
+ modrm = getIByte(delta+2);
+
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
+ assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
+ DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
+ nameXMMReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
+ assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
+ mkexpr(addr),
+ mkU32(4) )));
+ delta += 2+alen;
+ DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
+ dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ if (r2zero) {
+ assign(rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
+
+ assign(
+ dst64,
+ binop( Iop_32HLto64,
+ binop( Iop_F64toI32S,
+ mkexpr(rmode),
+ unop( Iop_F32toF64, mkexpr(f32hi) ) ),
+ binop( Iop_F64toI32S,
+ mkexpr(rmode),
+ unop( Iop_F32toF64, mkexpr(f32lo) ) )
+ )
+ );
+
+ putMMXReg(gregOfRM(modrm), mkexpr(dst64));
+ goto decode_success;
+ }
+
+ /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
+ I32 in ireg, according to prevailing SSE rounding mode */
+ /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
+ I32 in ireg, rounding towards zero */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F
+ && (insn[2] == 0x2D || insn[2] == 0x2C)) {
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f32lo = newTemp(Ity_F32);
+ Bool r2zero = toBool(insn[2] == 0x2C);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ delta += 3+1;
+ assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
+ DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
+ nameXMMReg(eregOfRM(modrm)),
+ nameIReg(4, gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
+ delta += 3+alen;
+ DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
+ dis_buf,
+ nameIReg(4, gregOfRM(modrm)));
+ }
+
+ if (r2zero) {
+ assign( rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
+
+ putIReg(4, gregOfRM(modrm),
+ binop( Iop_F64toI32S,
+ mkexpr(rmode),
+ unop( Iop_F32toF64, mkexpr(f32lo) ) )
+ );
+
+ goto decode_success;
+ }
+
+ /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
+ if (insn[0] == 0x0F && insn[1] == 0xAE
+ && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
+
+ IRTemp t64 = newTemp(Ity_I64);
+ IRTemp ew = newTemp(Ity_I32);
+
+ modrm = getIByte(delta+2);
+ vassert(!epartIsReg(modrm));
+ vassert(sz == 4);
+
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+ DIP("ldmxcsr %s\n", dis_buf);
+
+ /* The only thing we observe in %mxcsr is the rounding mode.
+ Therefore, pass the 32-bit value (SSE native-format control
+ word) to a clean helper, getting back a 64-bit value, the
+ lower half of which is the SSEROUND value to store, and the
+ upper half of which is the emulation-warning token which may
+ be generated.
+ */
+ /* ULong x86g_check_ldmxcsr ( UInt ); */
+ assign( t64, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86g_check_ldmxcsr",
+ &x86g_check_ldmxcsr,
+ mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
+ )
+ );
+
+ put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
+ assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
+ put_emwarn( mkexpr(ew) );
+ /* Finally, if an emulation warning was reported, side-exit to
+ the next insn, reporting the warning, so that Valgrind's
+ dispatcher sees the warning. */
+ stmt(
+ IRStmt_Exit(
+ binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
+ Ijk_EmWarn,
+ IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
+ )
+ );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F F7 = MASKMOVQ -- 8x8 masked store */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
+ Bool ok = False;
+ delta = dis_MMX( &ok, sorb, sz, delta+1 );
+ if (!ok)
+ goto decode_failure;
+ goto decode_success;
+ }
+
+ /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
+ /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
+ if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ putXMMReg( gregOfRM(modrm),
+ getXMMReg( eregOfRM(modrm) ));
+ DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ if (insn[1] == 0x28/*movaps*/)
+ gen_SEGV_if_not_16_aligned( addr );
+ putXMMReg( gregOfRM(modrm),
+ loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("mov[ua]ps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
+ /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
+ if (sz == 4 && insn[0] == 0x0F
+ && (insn[1] == 0x29 || insn[1] == 0x11)) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through; awaiting test case */
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ if (insn[1] == 0x29/*movaps*/)
+ gen_SEGV_if_not_16_aligned( addr );
+ storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
+ DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
+ dis_buf );
+ delta += 2+alen;
+ goto decode_success;
+ }
+ }
+
+ /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
+ /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
+ getXMMRegLane64( eregOfRM(modrm), 0 ) );
+ DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+ putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movhps %s,%s\n", dis_buf,
+ nameXMMReg( gregOfRM(modrm) ));
+ }
+ goto decode_success;
+ }
+
+ /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
+ if (!epartIsReg(insn[2])) {
+ delta += 2;
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ delta += alen;
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRM(insn[2]),
+ 1/*upper lane*/ ) );
+ DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
+ dis_buf);
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
+ /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ putXMMRegLane64( gregOfRM(modrm),
+ 0/*lower lane*/,
+ getXMMRegLane64( eregOfRM(modrm), 1 ));
+ DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+ putXMMRegLane64( gregOfRM(modrm), 0/*lower lane*/,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movlps %s, %s\n",
+ dis_buf, nameXMMReg( gregOfRM(modrm) ));
+ }
+ goto decode_success;
+ }
+
+ /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
+ if (!epartIsReg(insn[2])) {
+ delta += 2;
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ delta += alen;
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRM(insn[2]),
+ 0/*lower lane*/ ) );
+ DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
+ dis_buf);
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
+ to 4 lowest bits of ireg(G) */
+ if (insn[0] == 0x0F && insn[1] == 0x50) {
+ modrm = getIByte(delta+2);
+ if (sz == 4 && epartIsReg(modrm)) {
+ Int src;
+ t0 = newTemp(Ity_I32);
+ t1 = newTemp(Ity_I32);
+ t2 = newTemp(Ity_I32);
+ t3 = newTemp(Ity_I32);
+ delta += 2+1;
+ src = eregOfRM(modrm);
+ assign( t0, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
+ mkU32(1) ));
+ assign( t1, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
+ mkU32(2) ));
+ assign( t2, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
+ mkU32(4) ));
+ assign( t3, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
+ mkU32(8) ));
+ putIReg(4, gregOfRM(modrm),
+ binop(Iop_Or32,
+ binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
+ binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
+ )
+ );
+ DIP("movmskps %s,%s\n", nameXMMReg(src),
+ nameIReg(4, gregOfRM(modrm)));
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
+ /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
+ if (insn[0] == 0x0F && insn[1] == 0x2B) {
+ modrm = getIByte(delta+2);
+ if (!epartIsReg(modrm)) {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
+ DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
+ nameXMMReg(gregOfRM(modrm)),
+ dis_buf);
+ delta += 2+alen;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F E7 = MOVNTQ -- for us, just a plain MMX store. Note, the
+ Intel manual does not say anything about the usual business of
+ the FP reg tags getting trashed whenever an MMX insn happens.
+ So we just leave them alone.
+ */
+ if (insn[0] == 0x0F && insn[1] == 0xE7) {
+ modrm = getIByte(delta+2);
+ if (sz == 4 && !epartIsReg(modrm)) {
+ /* do_MMX_preamble(); Intel docs don't specify this */
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
+ DIP("movntq %s,%s\n", dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ delta += 2+alen;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
+ (lo 1/4 xmm). If E is mem, upper 3/4 of G is zeroed out. */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
+ vassert(sz == 4);
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ putXMMRegLane32( gregOfRM(modrm), 0,
+ getXMMRegLane32( eregOfRM(modrm), 0 ));
+ DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ /* zero bits 127:64 */
+ putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
+ /* zero bits 63:32 */
+ putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
+ /* write bits 31:0 */
+ putXMMRegLane32( gregOfRM(modrm), 0,
+ loadLE(Ity_I32, mkexpr(addr)) );
+ DIP("movss %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+alen;
+ }
+ goto decode_success;
+ }
+
+ /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
+ or lo 1/4 xmm). */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
+ vassert(sz == 4);
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ /* fall through, we don't yet have a test case */
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ storeLE( mkexpr(addr),
+ getXMMRegLane32(gregOfRM(modrm), 0) );
+ DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
+ dis_buf);
+ delta += 3+alen;
+ goto decode_success;
+ }
+ }
+
+ /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 56 = ORPS -- G = G or E */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "pavgb", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "pavgw", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
+ zero-extend of it in ireg(G). */
+ if (insn[0] == 0x0F && insn[1] == 0xC5) {
+ modrm = insn[2];
+ if (sz == 4 && epartIsReg(modrm)) {
+ IRTemp sV = newTemp(Ity_I64);
+ t5 = newTemp(Ity_I16);
+ do_MMX_preamble();
+ assign(sV, getMMXReg(eregOfRM(modrm)));
+ breakup64to16s( sV, &t3, &t2, &t1, &t0 );
+ switch (insn[3] & 3) {
+ case 0: assign(t5, mkexpr(t0)); break;
+ case 1: assign(t5, mkexpr(t1)); break;
+ case 2: assign(t5, mkexpr(t2)); break;
+ case 3: assign(t5, mkexpr(t3)); break;
+ default: vassert(0); /*NOTREACHED*/
+ }
+ putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
+ DIP("pextrw $%d,%s,%s\n",
+ (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
+ nameIReg(4,gregOfRM(modrm)));
+ delta += 4;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
+ put it into the specified lane of mmx(G). */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
+ /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
+ mmx reg. t4 is the new lane value. t5 is the original
+ mmx value. t6 is the new mmx value. */
+ Int lane;
+ t4 = newTemp(Ity_I16);
+ t5 = newTemp(Ity_I64);
+ t6 = newTemp(Ity_I64);
+ modrm = insn[2];
+ do_MMX_preamble();
+
+ assign(t5, getMMXReg(gregOfRM(modrm)));
+ breakup64to16s( t5, &t3, &t2, &t1, &t0 );
+
+ if (epartIsReg(modrm)) {
+ assign(t4, getIReg(2, eregOfRM(modrm)));
+ delta += 3+1;
+ lane = insn[3+1-1];
+ DIP("pinsrw $%d,%s,%s\n", (Int)lane,
+ nameIReg(2,eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 3+alen;
+ lane = insn[3+alen-1];
+ assign(t4, loadLE(Ity_I16, mkexpr(addr)));
+ DIP("pinsrw $%d,%s,%s\n", (Int)lane,
+ dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ switch (lane & 3) {
+ case 0: assign(t6, mk64from16s(t3,t2,t1,t4)); break;
+ case 1: assign(t6, mk64from16s(t3,t2,t4,t0)); break;
+ case 2: assign(t6, mk64from16s(t3,t4,t1,t0)); break;
+ case 3: assign(t6, mk64from16s(t4,t2,t1,t0)); break;
+ default: vassert(0); /*NOTREACHED*/
+ }
+ putMMXReg(gregOfRM(modrm), mkexpr(t6));
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F EE = PMAXSW -- 16x4 signed max */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "pmaxsw", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F DE = PMAXUB -- 8x8 unsigned max */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "pmaxub", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F EA = PMINSW -- 16x4 signed min */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "pminsw", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F DA = PMINUB -- 8x8 unsigned min */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "pminub", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
+ mmx(E), turn them into a byte, and put zero-extend of it in
+ ireg(G). */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ do_MMX_preamble();
+ t0 = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I32);
+ assign(t0, getMMXReg(eregOfRM(modrm)));
+ assign(t1, mkIRExprCCall(
+ Ity_I32, 0/*regparms*/,
+ "x86g_calculate_mmx_pmovmskb",
+ &x86g_calculate_mmx_pmovmskb,
+ mkIRExprVec_1(mkexpr(t0))));
+ putIReg(4, gregOfRM(modrm), mkexpr(t1));
+ DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
+ nameIReg(4,gregOfRM(modrm)));
+ delta += 3;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "pmuluh", False );
+ goto decode_success;
+ }
+
+ /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
+ /* 0F 18 /1 = PREFETCHT0 -- with various different hints */
+ /* 0F 18 /2 = PREFETCHT1 */
+ /* 0F 18 /3 = PREFETCHT2 */
+ if (insn[0] == 0x0F && insn[1] == 0x18
+ && !epartIsReg(insn[2])
+ && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
+ HChar* hintstr = "??";
+
+ modrm = getIByte(delta+2);
+ vassert(!epartIsReg(modrm));
+
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+
+ switch (gregOfRM(modrm)) {
+ case 0: hintstr = "nta"; break;
+ case 1: hintstr = "t0"; break;
+ case 2: hintstr = "t1"; break;
+ case 3: hintstr = "t2"; break;
+ default: vassert(0); /*NOTREACHED*/
+ }
+
+ DIP("prefetch%s %s\n", hintstr, dis_buf);
+ goto decode_success;
+ }
+
+ /* 0F 0D /0 = PREFETCH m8 -- 3DNow! prefetch */
+ /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
+ if (insn[0] == 0x0F && insn[1] == 0x0D
+ && !epartIsReg(insn[2])
+ && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
+ HChar* hintstr = "??";
+
+ modrm = getIByte(delta+2);
+ vassert(!epartIsReg(modrm));
+
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+
+ switch (gregOfRM(modrm)) {
+ case 0: hintstr = ""; break;
+ case 1: hintstr = "w"; break;
+ default: vassert(0); /*NOTREACHED*/
+ }
+
+ DIP("prefetch%s %s\n", hintstr, dis_buf);
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "psadbw", False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
+ /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
+ Int order;
+ IRTemp sV, dV, s3, s2, s1, s0;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+ sV = newTemp(Ity_I64);
+ dV = newTemp(Ity_I64);
+ do_MMX_preamble();
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregOfRM(modrm)) );
+ order = (Int)insn[3];
+ delta += 2+2;
+ DIP("pshufw $%d,%s,%s\n", order,
+ nameMMXReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ order = (Int)insn[2+alen];
+ delta += 3+alen;
+ DIP("pshufw $%d,%s,%s\n", order,
+ dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+ breakup64to16s( sV, &s3, &s2, &s1, &s0 );
+
+# define SEL(n) \
+ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
+ assign(dV,
+ mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
+ SEL((order>>2)&3), SEL((order>>0)&3) )
+ );
+ putMMXReg(gregOfRM(modrm), mkexpr(dV));
+# undef SEL
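+ /* Worked example: order 0xE4 (binary 11 10 01 00) selects
+ s3,s2,s1,s0 for lanes 3..0 and so is the identity shuffle;
+ order 0x1B reverses the four 16-bit lanes. */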
+ goto decode_success;
+ }
+
+ /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
+ if (insn[0] == 0x0F && insn[1] == 0x53) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
+ "rcpps", Iop_Recip32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
+ "rcpss", Iop_Recip32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
+ if (insn[0] == 0x0F && insn[1] == 0x52) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
+ "rsqrtps", Iop_RSqrt32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
+ "rsqrtss", Iop_RSqrt32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F AE /7 = SFENCE -- flush pending operations to memory */
+ if (insn[0] == 0x0F && insn[1] == 0xAE
+ && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
+ vassert(sz == 4);
+ delta += 3;
+ /* Insert a memory fence. It's sometimes important that these
+ are carried through to the generated code. */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("sfence\n");
+ goto decode_success;
+ }
+
+ /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
+ Int select;
+ IRTemp sV, dV;
+ IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ select = (Int)insn[3];
+ delta += 2+2;
+ DIP("shufps $%d,%s,%s\n", select,
+ nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ select = (Int)insn[2+alen];
+ delta += 3+alen;
+ DIP("shufps $%d,%s,%s\n", select,
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ breakup128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+
+# define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
+# define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
+
+ putXMMReg(
+ gregOfRM(modrm),
+ mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
+ SELD((select>>2)&3), SELD((select>>0)&3) )
+ );
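+ /* The low two result lanes are selected from dV and the high
+ two from sV; e.g. select 0xE4 yields lanes (3..0) =
+ s3,s2,d1,d0. */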
+
+# undef SELD
+# undef SELS
+
+ goto decode_success;
+ }
+
+ /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
+ delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
+ "sqrtps", Iop_Sqrt32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
+ "sqrtss", Iop_Sqrt32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
+ if (insn[0] == 0x0F && insn[1] == 0xAE
+ && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
+ modrm = getIByte(delta+2);
+ vassert(sz == 4);
+ vassert(!epartIsReg(modrm));
+
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+
+ /* Fake up a native SSE mxcsr word. The only thing it depends
+ on is SSEROUND[1:0], so call a clean helper to cook it up.
+ */
+ /* UInt x86g_create_mxcsr ( UInt sseround ) */
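+ /* A sketch of the expected result, assuming the usual MXCSR
+ reset value 0x1F80 with the rounding control at bits 14:13:
+ return 0x1F80 | ((sseround & 3) << 13); */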
+ DIP("stmxcsr %s\n", dis_buf);
+ storeLE( mkexpr(addr),
+ mkIRExprCCall(
+ Ity_I32, 0/*regp*/,
+ "x86g_create_mxcsr", &x86g_create_mxcsr,
+ mkIRExprVec_1( get_sse_roundingmode() )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
+ goto decode_success;
+ }
+
+ /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
+ goto decode_success;
+ }
+
+ /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
+ /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
+ /* These just appear to be special cases of SHUFPS */
+ if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
+ IRTemp sV, dV;
+ IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
+ Bool hi = toBool(insn[1] == 0x15);
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
+ nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ breakup128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+
+ if (hi) {
+ putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
+ } else {
+ putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
+ }
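+ /* i.e. unpcklps interleaves the low halves of the operands,
+ giving lanes (3..0) = s1,d1,s0,d0, and unpckhps does the
+ same with the high halves. */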
+
+ goto decode_success;
+ }
+
+ /* 0F 57 = XORPS -- G = G xor E */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSE decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* ---------------------------------------------------- */
+ /* --- start of the SSE2 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* Skip parts of the decoder which don't apply given the stated
+ guest subarchitecture. */
+ if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
+ goto after_sse_decoders; /* no SSE2 capabilities */
+
+ insn = (UChar*)&guest_code[delta];
+
+ /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 55 = ANDNPD -- G = (not G) and E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
+ delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F 54 = ANDPD -- G = G and E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
+ delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
+ goto decode_success;
+ }
+
+ /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
+ vassert(sz == 4);
+ delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
+ goto decode_success;
+ }
+
+ /* 66 0F 2F = COMISD -- 64F0x2 comparison G,E, and set ZCP */
+ /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
+ if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
+ IRTemp argL = newTemp(Ity_F64);
+ IRTemp argR = newTemp(Ity_F64);
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
+ delta += 2+1;
+ DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)) );
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("[u]comisd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+ assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put(
+ OFFB_CC_DEP1,
+ binop( Iop_And32,
+ binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
+ mkU32(0x45)
+ )));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+ goto decode_success;
+ }
+
+ /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
+ F64 in xmm(G) */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
+ IRTemp arg64 = newTemp(Ity_I64);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
+ delta += 3+1;
+ DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("cvtdq2pd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
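+ /* No rounding mode is needed here: every I32 value is exactly
+ representable as an F64, so Iop_I32StoF64 is exact. */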
+
+ putXMMRegLane64F(
+ gregOfRM(modrm), 0,
+ unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
+ );
+
+ putXMMRegLane64F(
+ gregOfRM(modrm), 1,
+ unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
+ );
+
+ goto decode_success;
+ }
+
+ /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
+ xmm(G) */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtdq2ps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+ breakup128to32s( argV, &t3, &t2, &t1, &t0 );
+
+# define CVT(_t) binop( Iop_F64toF32, \
+ mkexpr(rmode), \
+ unop(Iop_I32StoF64,mkexpr(_t)))
+
+ putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
+ putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
+ putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
+ putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
+ lo half xmm(G), and zero upper half */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("cvtpd2dq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+ t0 = newTemp(Ity_F64);
+ t1 = newTemp(Ity_F64);
+ assign( t0, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128to64, mkexpr(argV))) );
+ assign( t1, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128HIto64, mkexpr(argV))) );
+
+# define CVT(_t) binop( Iop_F64toI32S, \
+ mkexpr(rmode), \
+ mkexpr(_t) )
+
+ putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
+ putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
+ putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
+ putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
+ I32 in mmx, according to prevailing SSE rounding mode */
+ /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
+ I32 in mmx, rounding towards zero */
+ if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
+ IRTemp dst64 = newTemp(Ity_I64);
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f64lo = newTemp(Ity_F64);
+ IRTemp f64hi = newTemp(Ity_F64);
+ Bool r2zero = toBool(insn[1] == 0x2C);
+
+ do_MMX_preamble();
+ modrm = getIByte(delta+2);
+
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
+ assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
+ DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
+ nameXMMReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
+ assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
+ mkexpr(addr),
+ mkU32(8) )));
+ delta += 2+alen;
+ DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
+ dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ if (r2zero) {
+ assign(rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
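+ /* The truncating ("t") variant always rounds towards zero;
+ the plain variant honours whatever rounding mode MXCSR
+ currently holds. */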
+
+ assign(
+ dst64,
+ binop( Iop_32HLto64,
+ binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
+ binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
+ )
+ );
+
+ putMMXReg(gregOfRM(modrm), mkexpr(dst64));
+ goto decode_success;
+ }
+
+ /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
+ lo half xmm(G), and zero upper half */
+ /* Note, this is practically identical to CVTPD2DQ. It would have
+ been nicer to merge them together, but the insn[] offsets differ
+ by one. */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtpd2ps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+ t0 = newTemp(Ity_F64);
+ t1 = newTemp(Ity_F64);
+ assign( t0, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128to64, mkexpr(argV))) );
+ assign( t1, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128HIto64, mkexpr(argV))) );
+
+# define CVT(_t) binop( Iop_F64toF32, \
+ mkexpr(rmode), \
+ mkexpr(_t) )
+
+ putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
+ putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
+ putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
+ putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
+ xmm(G) */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
+ IRTemp arg64 = newTemp(Ity_I64);
+
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ /* Only switch to MMX mode if the source is a MMX register.
+ This is inconsistent with all other instructions which
+ convert between XMM and (M64 or MMX), which always switch
+ to MMX mode even if 64-bit operand is M64 and not MMX. At
+ least, that's what the Intel docs seem to me to say.
+ Fixes #210264. */
+ do_MMX_preamble();
+ assign( arg64, getMMXReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtpi2pd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ putXMMRegLane64F(
+ gregOfRM(modrm), 0,
+ unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
+ );
+
+ putXMMRegLane64F(
+ gregOfRM(modrm), 1,
+ unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
+ );
+
+ goto decode_success;
+ }
+
+ /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
+ xmm(G) */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvtps2dq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+ breakup128to32s( argV, &t3, &t2, &t1, &t0 );
+
+ /* This is less than ideal. If it turns out to be a performance
+ bottleneck it can be improved. */
+# define CVT(_t) \
+ binop( Iop_F64toI32S, \
+ mkexpr(rmode), \
+ unop( Iop_F32toF64, \
+ unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
+
+ putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
+ putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
+ putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
+ putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
+ F64 in xmm(G). */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
+ IRTemp f32lo = newTemp(Ity_F32);
+ IRTemp f32hi = newTemp(Ity_F32);
+
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
+ assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
+ delta += 2+1;
+ DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
+ assign( f32hi, loadLE(Ity_F32,
+ binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
+ delta += 2+alen;
+ DIP("cvtps2pd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ putXMMRegLane64F( gregOfRM(modrm), 1,
+ unop(Iop_F32toF64, mkexpr(f32hi)) );
+ putXMMRegLane64F( gregOfRM(modrm), 0,
+ unop(Iop_F32toF64, mkexpr(f32lo)) );
+
+ goto decode_success;
+ }
+
+ /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
+ I32 in ireg, according to prevailing SSE rounding mode */
+ /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
+ I32 in ireg, rounding towards zero */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F
+ && (insn[2] == 0x2D || insn[2] == 0x2C)) {
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f64lo = newTemp(Ity_F64);
+ Bool r2zero = toBool(insn[2] == 0x2C);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ delta += 3+1;
+ assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
+ DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
+ nameXMMReg(eregOfRM(modrm)),
+ nameIReg(4, gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
+ delta += 3+alen;
+ DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
+ dis_buf,
+ nameIReg(4, gregOfRM(modrm)));
+ }
+
+ if (r2zero) {
+ assign( rmode, mkU32((UInt)Irrm_ZERO) );
+ } else {
+ assign( rmode, get_sse_roundingmode() );
+ }
+
+ putIReg(4, gregOfRM(modrm),
+ binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
+
+ goto decode_success;
+ }
+
+ /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
+ low 1/4 xmm(G), according to prevailing SSE rounding mode */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
+ IRTemp rmode = newTemp(Ity_I32);
+ IRTemp f64lo = newTemp(Ity_F64);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ delta += 3+1;
+ assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
+ DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
+ delta += 3+alen;
+ DIP("cvtsd2ss %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ assign( rmode, get_sse_roundingmode() );
+ putXMMRegLane32F(
+ gregOfRM(modrm), 0,
+ binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
+ );
+
+ goto decode_success;
+ }
+
+ /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
+ half xmm */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
+ IRTemp arg32 = newTemp(Ity_I32);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ assign( arg32, getIReg(4, eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("cvtsi2sd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ putXMMRegLane64F(
+ gregOfRM(modrm), 0,
+ unop(Iop_I32StoF64, mkexpr(arg32)) );
+
+ goto decode_success;
+ }
+
+ /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
+ low half xmm(G) */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
+ IRTemp f32lo = newTemp(Ity_F32);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ delta += 3+1;
+ assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
+ DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
+ delta += 3+alen;
+ DIP("cvtss2sd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ putXMMRegLane64F( gregOfRM(modrm), 0,
+ unop( Iop_F32toF64, mkexpr(f32lo) ) );
+
+ goto decode_success;
+ }
+
+ /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
+ lo half xmm(G), and zero upper half, rounding towards zero */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("cvttpd2dq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ assign( rmode, mkU32((UInt)Irrm_ZERO) );
+
+ t0 = newTemp(Ity_F64);
+ t1 = newTemp(Ity_F64);
+ assign( t0, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128to64, mkexpr(argV))) );
+ assign( t1, unop(Iop_ReinterpI64asF64,
+ unop(Iop_V128HIto64, mkexpr(argV))) );
+
+# define CVT(_t) binop( Iop_F64toI32S, \
+ mkexpr(rmode), \
+ mkexpr(_t) )
+
+ putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
+ putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
+ putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
+ putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
+ xmm(G), rounding towards zero */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
+ IRTemp argV = newTemp(Ity_V128);
+ IRTemp rmode = newTemp(Ity_I32);
+ vassert(sz == 4);
+
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ assign( argV, getXMMReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("cvttps2dq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)) );
+ }
+
+ assign( rmode, mkU32((UInt)Irrm_ZERO) );
+ breakup128to32s( argV, &t3, &t2, &t1, &t0 );
+
+ /* This is less than ideal. If it turns out to be a performance
+ bottleneck it can be improved. */
+# define CVT(_t) \
+ binop( Iop_F64toI32S, \
+ mkexpr(rmode), \
+ unop( Iop_F32toF64, \
+ unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
+
+ putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
+ putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
+ putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
+ putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
+
+# undef CVT
+
+ goto decode_success;
+ }
+
+ /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
+ goto decode_success;
+ }
+
+ /* 0F AE /5 = LFENCE -- flush pending operations to memory */
+ /* 0F AE /6 = MFENCE -- flush pending operations to memory */
+ if (insn[0] == 0x0F && insn[1] == 0xAE
+ && epartIsReg(insn[2])
+ && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
+ vassert(sz == 4);
+ delta += 3;
+ /* Insert a memory fence. It's sometimes important that these
+ are carried through to the generated code. */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
+ goto decode_success;
+ }
+
+ /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
+ /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
+ /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
+ if (sz == 2 && insn[0] == 0x0F
+ && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
+ HChar* wot = insn[1]==0x28 ? "apd" :
+ insn[1]==0x10 ? "upd" : "dqa";
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ putXMMReg( gregOfRM(modrm),
+ getXMMReg( eregOfRM(modrm) ));
+ DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
+ gen_SEGV_if_not_16_aligned( addr );
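+ /* (movapd and movdqa require a 16-aligned address;
+ movupd does not.) */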
+ putXMMReg( gregOfRM(modrm),
+ loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("mov%s %s,%s\n", wot, dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 2+alen;
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
+ /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
+ if (sz == 2 && insn[0] == 0x0F
+ && (insn[1] == 0x29 || insn[1] == 0x11)) {
+ HChar* wot = insn[1]==0x29 ? "apd" : "upd";
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through; awaiting test case */
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ if (insn[1] == 0x29/*movapd*/)
+ gen_SEGV_if_not_16_aligned( addr );
+ storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
+ DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
+ dis_buf );
+ delta += 2+alen;
+ goto decode_success;
+ }
+ }
+
+ /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ putXMMReg(
+ gregOfRM(modrm),
+ unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
+ );
+ DIP("movd %s, %s\n",
+ nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+ putXMMReg(
+ gregOfRM(modrm),
+ unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
+ );
+ DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ putIReg( 4, eregOfRM(modrm),
+ getXMMRegLane32(gregOfRM(modrm), 0) );
+ DIP("movd %s, %s\n",
+ nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
+ } else {
+ addr = disAMode( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+ storeLE( mkexpr(addr),
+ getXMMRegLane32(gregOfRM(modrm), 0) );
+ DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ delta += 2+1;
+ putXMMReg( eregOfRM(modrm),
+ getXMMReg(gregOfRM(modrm)) );
+ DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
+ nameXMMReg(eregOfRM(modrm)));
+ } else {
+ addr = disAMode( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+ gen_SEGV_if_not_16_aligned( addr );
+ storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
+ DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
+ }
+ goto decode_success;
+ }
+
+ /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
+ /* Unfortunately can't simply use the MOVDQA case since the
+ prefix lengths are different (66 vs F3) */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
+ vassert(sz == 4);
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ putXMMReg( gregOfRM(modrm),
+ getXMMReg( eregOfRM(modrm) ));
+ DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ putXMMReg( gregOfRM(modrm),
+ loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("movdqu %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+alen;
+ }
+ goto decode_success;
+ }
+
+ /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
+ /* Unfortunately can't simply use the MOVDQA case since the
+ prefix lengths are different (66 vs F3) */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
+ vassert(sz == 4);
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ delta += 3+1;
+ putXMMReg( eregOfRM(modrm),
+ getXMMReg(gregOfRM(modrm)) );
+ DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
+ nameXMMReg(eregOfRM(modrm)));
+ } else {
+ addr = disAMode( &alen, sorb, delta+3, dis_buf );
+ delta += 3+alen;
+ storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
+ DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
+ }
+ goto decode_success;
+ }
+
+ /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
+ vassert(sz == 4);
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ do_MMX_preamble();
+ putMMXReg( gregOfRM(modrm),
+ getXMMRegLane64( eregOfRM(modrm), 0 ));
+ DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ delta += 3+1;
+ goto decode_success;
+ } else {
+ /* fall through, apparently no mem case for this insn */
+ }
+ }
+
+ /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
+ /* This seems identical to MOVHPS. This instruction encoding is
+ completely crazy. */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through; apparently reg-reg is not possible */
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+ putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movhpd %s,%s\n", dis_buf,
+ nameXMMReg( gregOfRM(modrm) ));
+ goto decode_success;
+ }
+ }
+
+ /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
+ /* Again, this seems identical to MOVHPS. */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
+ if (!epartIsReg(insn[2])) {
+ delta += 2;
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ delta += alen;
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRM(insn[2]),
+ 1/*upper lane*/ ) );
+ DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
+ dis_buf);
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
+ /* Identical to MOVLPS ? */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through; apparently reg-reg is not possible */
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+ putXMMRegLane64( gregOfRM(modrm), 0/*lower lane*/,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movlpd %s, %s\n",
+ dis_buf, nameXMMReg( gregOfRM(modrm) ));
+ goto decode_success;
+ }
+ }
+
+ /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
+ /* Identical to MOVLPS ? */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
+ if (!epartIsReg(insn[2])) {
+ delta += 2;
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ delta += alen;
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRM(insn[2]),
+ 0/*lower lane*/ ) );
+ DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
+ dis_buf);
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
+ 2 lowest bits of ireg(G) */
+ if (insn[0] == 0x0F && insn[1] == 0x50) {
+ modrm = getIByte(delta+2);
+ if (sz == 2 && epartIsReg(modrm)) {
+ Int src;
+ t0 = newTemp(Ity_I32);
+ t1 = newTemp(Ity_I32);
+ delta += 2+1;
+ src = eregOfRM(modrm);
+ assign( t0, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
+ mkU32(1) ));
+ assign( t1, binop( Iop_And32,
+ binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
+ mkU32(2) ));
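+ /* Lane 1 holds bits 63:32 of the low F64 and lane 3 bits
+ 63:32 of the high F64, so bit 31 of each lane is that
+ F64's sign bit. */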
+ putIReg(4, gregOfRM(modrm),
+ binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
+ );
+ DIP("movmskpd %s,%s\n", nameXMMReg(src),
+ nameIReg(4, gregOfRM(modrm)));
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
+ if (insn[0] == 0x0F && insn[1] == 0xF7) {
+ modrm = getIByte(delta+2);
+ if (sz == 2 && epartIsReg(modrm)) {
+ IRTemp regD = newTemp(Ity_V128);
+ IRTemp mask = newTemp(Ity_V128);
+ IRTemp olddata = newTemp(Ity_V128);
+ IRTemp newdata = newTemp(Ity_V128);
+ addr = newTemp(Ity_I32);
+
+ assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
+ assign( regD, getXMMReg( gregOfRM(modrm) ));
+
+ /* Unfortunately can't do the obvious thing with SarN8x16
+ here since that can't be re-emitted as SSE2 code - no such
+ insn. */
+ assign(
+ mask,
+ binop(Iop_64HLtoV128,
+ binop(Iop_SarN8x8,
+ getXMMRegLane64( eregOfRM(modrm), 1 ),
+ mkU8(7) ),
+ binop(Iop_SarN8x8,
+ getXMMRegLane64( eregOfRM(modrm), 0 ),
+ mkU8(7) ) ));
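+ /* Shifting each byte right arithmetically by 7 copies its
+ sign bit across the whole byte, giving 0x00 or 0xFF per
+ lane -- exactly the byte-granularity mask needed for the
+ merge below. */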
+ assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
+ assign( newdata,
+ binop(Iop_OrV128,
+ binop(Iop_AndV128,
+ mkexpr(regD),
+ mkexpr(mask) ),
+ binop(Iop_AndV128,
+ mkexpr(olddata),
+ unop(Iop_NotV128, mkexpr(mask)))) );
+ storeLE( mkexpr(addr), mkexpr(newdata) );
+
+ delta += 2+1;
+ DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
+ nameXMMReg( gregOfRM(modrm) ) );
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
+ if (insn[0] == 0x0F && insn[1] == 0xE7) {
+ modrm = getIByte(delta+2);
+ if (sz == 2 && !epartIsReg(modrm)) {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
+ DIP("movntdq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 2+alen;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
+ if (insn[0] == 0x0F && insn[1] == 0xC3) {
+ vassert(sz == 4);
+ modrm = getIByte(delta+2);
+ if (!epartIsReg(modrm)) {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
+ DIP("movnti %s,%s\n", dis_buf,
+ nameIReg(4, gregOfRM(modrm)));
+ delta += 2+alen;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
+ or lo half xmm). */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
+ modrm = getIByte(delta+2);
+ if (epartIsReg(modrm)) {
+ /* fall through, awaiting test case */
+ /* dst: lo half copied, hi half zeroed */
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ storeLE( mkexpr(addr),
+ getXMMRegLane64( gregOfRM(modrm), 0 ));
+ DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
+ delta += 2+alen;
+ goto decode_success;
+ }
+ }
+
+ /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
+ hi half). */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
+ vassert(sz == 4);
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ do_MMX_preamble();
+ putXMMReg( gregOfRM(modrm),
+ unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
+ DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+1;
+ goto decode_success;
+ } else {
+ /* fall through, apparently no mem case for this insn */
+ }
+ }
+
+ /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
+ G (lo half xmm). Upper half of G is zeroed out. */
+ /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
+ G (lo half xmm). If E is mem, upper half of G is zeroed out.
+ If E is reg, upper half of G is unchanged. */
+ if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
+ || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
+ vassert(sz == 4);
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ putXMMRegLane64( gregOfRM(modrm), 0,
+ getXMMRegLane64( eregOfRM(modrm), 0 ));
+ if (insn[0] == 0xF3/*MOVQ*/) {
+ /* zero bits 127:64 */
+ putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
+ }
+ DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ /* zero bits 127:64 */
+ putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
+ /* write bits 63:0 */
+ putXMMRegLane64( gregOfRM(modrm), 0,
+ loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movsd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+alen;
+ }
+ goto decode_success;
+ }
+
+ /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
+ or lo half xmm). */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
+ vassert(sz == 4);
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ putXMMRegLane64( eregOfRM(modrm), 0,
+ getXMMRegLane64( gregOfRM(modrm), 0 ));
+ DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
+ nameXMMReg(eregOfRM(modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ storeLE( mkexpr(addr),
+ getXMMRegLane64(gregOfRM(modrm), 0) );
+ DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
+ dis_buf);
+ delta += 3+alen;
+ }
+ goto decode_success;
+ }
+
+ /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 56 = ORPD -- G = G or E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
+ Int select;
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp s1 = newTemp(Ity_I64);
+ IRTemp s0 = newTemp(Ity_I64);
+ IRTemp d1 = newTemp(Ity_I64);
+ IRTemp d0 = newTemp(Ity_I64);
+
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ select = (Int)insn[3];
+ delta += 2+2;
+ DIP("shufpd $%d,%s,%s\n", select,
+ nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ select = (Int)insn[2+alen];
+ delta += 3+alen;
+ DIP("shufpd $%d,%s,%s\n", select,
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
+
+# define SELD(n) mkexpr((n)==0 ? d0 : d1)
+# define SELS(n) mkexpr((n)==0 ? s0 : s1)
+
+ putXMMReg(
+ gregOfRM(modrm),
+ binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
+ );
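+ /* Bit 0 of the immediate picks the low result lane from dV
+ and bit 1 the high lane from sV; e.g. select == 2 gives
+ (hi,lo) = (s1,d0). */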
+
+# undef SELD
+# undef SELS
+
+ goto decode_success;
+ }
+
+ /* 66 0F 51 = SQRTPD -- sqrt 64Fx2 from R/M to R */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
+ delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
+ "sqrtpd", Iop_Sqrt64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 51 = SQRTSD -- sqrt 64F0x2 from R/M to R */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
+ "sqrtsd", Iop_Sqrt64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
+ goto decode_success;
+ }
+
+ /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
+ vassert(sz == 4);
+ delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
+ /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
+ /* These just appear to be special cases of SHUFPD */
+ if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
+ IRTemp s1 = newTemp(Ity_I64);
+ IRTemp s0 = newTemp(Ity_I64);
+ IRTemp d1 = newTemp(Ity_I64);
+ IRTemp d0 = newTemp(Ity_I64);
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ Bool hi = toBool(insn[1] == 0x15);
+
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
+ nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( d0, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( s0, unop(Iop_V128to64, mkexpr(sV)) );
+
+ if (hi) {
+ putXMMReg( gregOfRM(modrm),
+ binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
+ } else {
+ putXMMReg( gregOfRM(modrm),
+ binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
+ }
+
+ goto decode_success;
+ }
+
+ /* 66 0F 57 = XORPD -- G = G xor E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F 6B = PACKSSDW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "packssdw", Iop_QNarrow32Sx4, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 63 = PACKSSWB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "packsswb", Iop_QNarrow16Sx8, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 67 = PACKUSWB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "packuswb", Iop_QNarrow16Ux8, True );
+ goto decode_success;
+ }
+
+ /* 66 0F FC = PADDB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "paddb", Iop_Add8x16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F FE = PADDD */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "paddd", Iop_Add32x4, False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
+ /* 0F D4 = PADDQ -- add 64x1 */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "paddq", False );
+ goto decode_success;
+ }
+
+ /* 66 0F D4 = PADDQ */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "paddq", Iop_Add64x2, False );
+ goto decode_success;
+ }
+
+ /* 66 0F FD = PADDW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "paddw", Iop_Add16x8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F EC = PADDSB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "paddsb", Iop_QAdd8Sx16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F ED = PADDSW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "paddsw", Iop_QAdd16Sx8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DC = PADDUSB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "paddusb", Iop_QAdd8Ux16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DD = PADDUSW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "paddusw", Iop_QAdd16Ux8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DB = PAND */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F DF = PANDN */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
+ delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F E0 = PAVGB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pavgb", Iop_Avg8Ux16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F E3 = PAVGW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pavgw", Iop_Avg16Ux8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 74 = PCMPEQB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pcmpeqb", Iop_CmpEQ8x16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 76 = PCMPEQD */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pcmpeqd", Iop_CmpEQ32x4, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 75 = PCMPEQW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pcmpeqw", Iop_CmpEQ16x8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 64 = PCMPGTB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pcmpgtb", Iop_CmpGT8Sx16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 66 = PCMPGTD */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pcmpgtd", Iop_CmpGT32Sx4, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 65 = PCMPGTW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pcmpgtw", Iop_CmpGT16Sx8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
+ zero-extend of it in ireg(G). */
+ if (insn[0] == 0x0F && insn[1] == 0xC5) {
+ modrm = insn[2];
+ if (sz == 2 && epartIsReg(modrm)) {
+ t5 = newTemp(Ity_V128);
+ t4 = newTemp(Ity_I16);
+ assign(t5, getXMMReg(eregOfRM(modrm)));
+ breakup128to32s( t5, &t3, &t2, &t1, &t0 );
+ switch (insn[3] & 7) {
+ case 0: assign(t4, unop(Iop_32to16, mkexpr(t0))); break;
+ case 1: assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
+ case 2: assign(t4, unop(Iop_32to16, mkexpr(t1))); break;
+ case 3: assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
+ case 4: assign(t4, unop(Iop_32to16, mkexpr(t2))); break;
+ case 5: assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
+ case 6: assign(t4, unop(Iop_32to16, mkexpr(t3))); break;
+ case 7: assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
+ default: vassert(0); /*NOTREACHED*/
+ }
+ putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
+ DIP("pextrw $%d,%s,%s\n",
+ (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
+ nameIReg(4,gregOfRM(modrm)));
+ delta += 4;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
+ put it into the specified lane of xmm(G). */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
+ Int lane;
+ t4 = newTemp(Ity_I16);
+ modrm = insn[2];
+
+ if (epartIsReg(modrm)) {
+ assign(t4, getIReg(2, eregOfRM(modrm)));
+ delta += 3+1;
+ lane = insn[3+1-1];
+ DIP("pinsrw $%d,%s,%s\n", (Int)lane,
+ nameIReg(2,eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 3+alen;
+ lane = insn[3+alen-1];
+ assign(t4, loadLE(Ity_I16, mkexpr(addr)));
+ DIP("pinsrw $%d,%s,%s\n", (Int)lane,
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
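+ /* Only the low 3 bits of the immediate matter, since an XMM
+ register has just 8 16-bit lanes. */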
+ goto decode_success;
+ }
+
+ /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
+ E(xmm or mem) to G(xmm) */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
+ IRTemp s1V = newTemp(Ity_V128);
+ IRTemp s2V = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp s1Hi = newTemp(Ity_I64);
+ IRTemp s1Lo = newTemp(Ity_I64);
+ IRTemp s2Hi = newTemp(Ity_I64);
+ IRTemp s2Lo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( s1V, getXMMReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("pmaddwd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+ assign( s2V, getXMMReg(gregOfRM(modrm)) );
+ assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
+ assign( s1Lo, unop(Iop_V128to64, mkexpr(s1V)) );
+ assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
+ assign( s2Lo, unop(Iop_V128to64, mkexpr(s2V)) );
+ assign( dHi, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86g_calculate_mmx_pmaddwd",
+ &x86g_calculate_mmx_pmaddwd,
+ mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
+ ));
+ assign( dLo, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86g_calculate_mmx_pmaddwd",
+ &x86g_calculate_mmx_pmaddwd,
+ mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
+ ));
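+ /* Each helper call is expected to condense four signed
+ 16x16->32 products into two I32 lanes: result lane i is
+ s1[2i]*s2[2i] + s1[2i+1]*s2[2i+1]. */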
+ assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo)) );
+ putXMMReg(gregOfRM(modrm), mkexpr(dV));
+ goto decode_success;
+ }
+
+ /* 66 0F EE = PMAXSW -- 16x8 signed max */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pmaxsw", Iop_Max16Sx8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pmaxub", Iop_Max8Ux16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F EA = PMINSW -- 16x8 signed min */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pminsw", Iop_Min16Sx8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F DA = PMINUB -- 8x16 unsigned min */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pminub", Iop_Min8Ux16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
+ xmm(E), turn them into a 16-bit value, and put zero-extend of it
+ ireg(G). Doing this directly is just too cumbersome; give up
+ therefore and call a helper. */
+ /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ t0 = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I64);
+ assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
+ assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
+ t5 = newTemp(Ity_I32);
+ assign(t5, mkIRExprCCall(
+ Ity_I32, 0/*regparms*/,
+ "x86g_calculate_sse_pmovmskb",
+ &x86g_calculate_sse_pmovmskb,
+ mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
+ putIReg(4, gregOfRM(modrm), mkexpr(t5));
+ DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameIReg(4,gregOfRM(modrm)));
+ delta += 3;
+ goto decode_success;
+ }
+ /* else fall through */
+ }
+
+ /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pmulhuw", Iop_MulHi16Ux8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pmulhw", Iop_MulHi16Sx8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F D5 = PMULLW -- 16x8 multiply */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "pmullw", Iop_Mul16x8, False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
+ /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
+ 0 to form 64-bit result */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ t1 = newTemp(Ity_I32);
+ t0 = newTemp(Ity_I32);
+ modrm = insn[2];
+
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("pmuludq %s,%s\n", dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ assign( t0, unop(Iop_64to32, mkexpr(dV)) );
+ assign( t1, unop(Iop_64to32, mkexpr(sV)) );
+ putMMXReg( gregOfRM(modrm),
+ binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
+ goto decode_success;
+ }
+
+ /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
+ 0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
+ half */
+ /* This is a really poor translation -- could be improved if
+ performance critical */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
+ IRTemp sV, dV;
+ IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
+ t1 = newTemp(Ity_I64);
+ t0 = newTemp(Ity_I64);
+ modrm = insn[2];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("pmuludq %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ breakup128to32s( dV, &d3, &d2, &d1, &d0 );
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+
+ assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
+ putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
+ assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
+ putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
+ goto decode_success;
+ }
+
+ /* 66 0F EB = POR */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
+ goto decode_success;
+ }
+
+ /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
+ from E(xmm or mem) to G(xmm) */
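+   /* Each 64-bit half is computed by a helper; a sketch of the
+      per-half semantics (illustration only, with getByte(x,i) a
+      hypothetical byte accessor):
+         UInt sum = 0;
+         for (Int i = 0; i < 8; i++) {
+            Int d = (Int)getByte(s1, i) - (Int)getByte(s2, i);
+            sum += (d < 0) ? -d : d;
+         }
+         return (ULong)sum;   // fits in 16 bits; upper 48 bits zero
+   */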
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
+ IRTemp s1V = newTemp(Ity_V128);
+ IRTemp s2V = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp s1Hi = newTemp(Ity_I64);
+ IRTemp s1Lo = newTemp(Ity_I64);
+ IRTemp s2Hi = newTemp(Ity_I64);
+ IRTemp s2Lo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( s1V, getXMMReg(eregOfRM(modrm)) );
+ delta += 2+1;
+ DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 2+alen;
+ DIP("psadbw %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+ assign( s2V, getXMMReg(gregOfRM(modrm)) );
+ assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
+ assign( s1Lo, unop(Iop_V128to64, mkexpr(s1V)) );
+ assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
+ assign( s2Lo, unop(Iop_V128to64, mkexpr(s2V)) );
+ assign( dHi, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86g_calculate_mmx_psadbw",
+ &x86g_calculate_mmx_psadbw,
+ mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
+ ));
+ assign( dLo, mkIRExprCCall(
+ Ity_I64, 0/*regparms*/,
+ "x86g_calculate_mmx_psadbw",
+ &x86g_calculate_mmx_psadbw,
+ mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
+ ));
+ assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
+ putXMMReg(gregOfRM(modrm), mkexpr(dV));
+ goto decode_success;
+ }
+
+ /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
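+   /* The imm8 selects a source lane for each result lane, 2 bits per
+      lane.  E.g. order == 0x1B (binary 00 01 10 11) gives
+         res = [s0, s1, s2, s3] (hi..lo),
+      i.e. the four 32-bit lanes reversed. */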
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
+ Int order;
+ IRTemp sV, dV, s3, s2, s1, s0;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ order = (Int)insn[3];
+ delta += 2+2;
+ DIP("pshufd $%d,%s,%s\n", order,
+ nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ order = (Int)insn[2+alen];
+ delta += 3+alen;
+ DIP("pshufd $%d,%s,%s\n", order,
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+
+# define SEL(n) \
+ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
+ assign(dV,
+ mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
+ SEL((order>>2)&3), SEL((order>>0)&3) )
+ );
+ putXMMReg(gregOfRM(modrm), mkexpr(dV));
+# undef SEL
+ goto decode_success;
+ }
+
+ /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
+ mem) to G(xmm), and copy lower half */
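+   /* Same 2-bits-per-lane imm8 scheme as PSHUFD, applied to the four
+      16-bit lanes of the upper half; the lower 64 bits pass through
+      unchanged. */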
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
+ Int order;
+ IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ sVhi = newTemp(Ity_I64);
+ dVhi = newTemp(Ity_I64);
+ modrm = insn[3];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ order = (Int)insn[4];
+ delta += 4+1;
+ DIP("pshufhw $%d,%s,%s\n", order,
+ nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ order = (Int)insn[3+alen];
+ delta += 4+alen;
+ DIP("pshufhw $%d,%s,%s\n", order,
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+ assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
+
+# define SEL(n) \
+ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
+ assign(dVhi,
+ mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
+ SEL((order>>2)&3), SEL((order>>0)&3) )
+ );
+ assign(dV, binop( Iop_64HLtoV128,
+ mkexpr(dVhi),
+ unop(Iop_V128to64, mkexpr(sV))) );
+ putXMMReg(gregOfRM(modrm), mkexpr(dV));
+# undef SEL
+ goto decode_success;
+ }
+
+ /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
+ mem) to G(xmm), and copy upper half */
+ if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
+ Int order;
+ IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ sVlo = newTemp(Ity_I64);
+ dVlo = newTemp(Ity_I64);
+ modrm = insn[3];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ order = (Int)insn[4];
+ delta += 4+1;
+ DIP("pshuflw $%d,%s,%s\n", order,
+ nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ order = (Int)insn[3+alen];
+ delta += 4+alen;
+ DIP("pshuflw $%d,%s,%s\n", order,
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+ assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
+ breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
+
+# define SEL(n) \
+ ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
+ assign(dVlo,
+ mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
+ SEL((order>>2)&3), SEL((order>>0)&3) )
+ );
+ assign(dV, binop( Iop_64HLtoV128,
+ unop(Iop_V128HIto64, mkexpr(sV)),
+ mkexpr(dVlo) ) );
+ putXMMReg(gregOfRM(modrm), mkexpr(dV));
+# undef SEL
+ goto decode_success;
+ }
+
+ /* 66 0F 72 /6 ib = PSLLD by immediate */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 6) {
+ delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F F2 = PSLLD by E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
+ delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F 73 /7 ib = PSLLDQ by immediate */
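+   /* A 128-bit left shift by imm bytes, decomposed into 64-bit ops.
+      Sketch for the general case 0 < imm < 8:
+         lo' = lo << (8*imm);
+         hi' = (hi << (8*imm)) | (lo >> (8*(8-imm)));
+      imm == 0, imm == 8, 8 < imm < 16 and imm >= 16 are special-cased
+      below. */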
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 7) {
+ IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
+ Int imm = (Int)insn[3];
+ Int reg = eregOfRM(insn[2]);
+ DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
+ vassert(imm >= 0 && imm <= 255);
+ delta += 4;
+
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ hi64 = newTemp(Ity_I64);
+ lo64 = newTemp(Ity_I64);
+ hi64r = newTemp(Ity_I64);
+ lo64r = newTemp(Ity_I64);
+
+ if (imm >= 16) {
+ putXMMReg(reg, mkV128(0x0000));
+ goto decode_success;
+ }
+
+ assign( sV, getXMMReg(reg) );
+ assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
+
+ if (imm == 0) {
+ assign( lo64r, mkexpr(lo64) );
+ assign( hi64r, mkexpr(hi64) );
+ }
+ else
+ if (imm == 8) {
+ assign( lo64r, mkU64(0) );
+ assign( hi64r, mkexpr(lo64) );
+ }
+ else
+ if (imm > 8) {
+ assign( lo64r, mkU64(0) );
+ assign( hi64r, binop( Iop_Shl64,
+ mkexpr(lo64),
+ mkU8( 8*(imm-8) ) ));
+ } else {
+ assign( lo64r, binop( Iop_Shl64,
+ mkexpr(lo64),
+ mkU8(8 * imm) ));
+ assign( hi64r,
+ binop( Iop_Or64,
+ binop(Iop_Shl64, mkexpr(hi64),
+ mkU8(8 * imm)),
+ binop(Iop_Shr64, mkexpr(lo64),
+ mkU8(8 * (8 - imm)) )
+ )
+ );
+ }
+ assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
+ putXMMReg(reg, mkexpr(dV));
+ goto decode_success;
+ }
+
+ /* 66 0F 73 /6 ib = PSLLQ by immediate */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 6) {
+ delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F F3 = PSLLQ by E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
+ delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 71 /6 ib = PSLLW by immediate */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 6) {
+ delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F F1 = PSLLW by E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
+ delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F 72 /4 ib = PSRAD by immediate */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 4) {
+ delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F E2 = PSRAD by E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
+ delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F 71 /4 ib = PSRAW by immediate */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 4) {
+ delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F E1 = PSRAW by E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
+ delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F 72 /2 ib = PSRLD by immediate */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 2) {
+ delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F D2 = PSRLD by E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
+ delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
+ goto decode_success;
+ }
+
+ /* 66 0F 73 /3 ib = PSRLDQ by immediate */
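+   /* Mirror image of the PSLLDQ decomposition above.  Sketch for
+      0 < imm < 8:
+         hi' = hi >> (8*imm);
+         lo' = (lo >> (8*imm)) | (hi << (8*(8-imm))); */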
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 3) {
+ IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
+ Int imm = (Int)insn[3];
+ Int reg = eregOfRM(insn[2]);
+ DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
+ vassert(imm >= 0 && imm <= 255);
+ delta += 4;
+
+ sV = newTemp(Ity_V128);
+ dV = newTemp(Ity_V128);
+ hi64 = newTemp(Ity_I64);
+ lo64 = newTemp(Ity_I64);
+ hi64r = newTemp(Ity_I64);
+ lo64r = newTemp(Ity_I64);
+
+ if (imm >= 16) {
+ putXMMReg(reg, mkV128(0x0000));
+ goto decode_success;
+ }
+
+ assign( sV, getXMMReg(reg) );
+ assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
+
+ if (imm == 0) {
+ assign( lo64r, mkexpr(lo64) );
+ assign( hi64r, mkexpr(hi64) );
+ }
+ else
+ if (imm == 8) {
+ assign( hi64r, mkU64(0) );
+ assign( lo64r, mkexpr(hi64) );
+ }
+ else
+ if (imm > 8) {
+ assign( hi64r, mkU64(0) );
+ assign( lo64r, binop( Iop_Shr64,
+ mkexpr(hi64),
+ mkU8( 8*(imm-8) ) ));
+ } else {
+ assign( hi64r, binop( Iop_Shr64,
+ mkexpr(hi64),
+ mkU8(8 * imm) ));
+ assign( lo64r,
+ binop( Iop_Or64,
+ binop(Iop_Shr64, mkexpr(lo64),
+ mkU8(8 * imm)),
+ binop(Iop_Shl64, mkexpr(hi64),
+ mkU8(8 * (8 - imm)) )
+ )
+ );
+ }
+
+ assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
+ putXMMReg(reg, mkexpr(dV));
+ goto decode_success;
+ }
+
+ /* 66 0F 73 /2 ib = PSRLQ by immediate */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 2) {
+ delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F D3 = PSRLQ by E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
+ delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
+ goto decode_success;
+ }
+
+ /* 66 0F 71 /2 ib = PSRLW by immediate */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
+ && epartIsReg(insn[2])
+ && gregOfRM(insn[2]) == 2) {
+ delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F D1 = PSRLW by E */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
+ delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
+ goto decode_success;
+ }
+
+ /* 66 0F F8 = PSUBB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "psubb", Iop_Sub8x16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F FA = PSUBD */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "psubd", Iop_Sub32x4, False );
+ goto decode_success;
+ }
+
+ /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
+ /* 0F FB = PSUBQ -- sub 64x1 */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
+ do_MMX_preamble();
+ delta = dis_MMXop_regmem_to_reg (
+ sorb, delta+2, insn[1], "psubq", False );
+ goto decode_success;
+ }
+
+ /* 66 0F FB = PSUBQ */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "psubq", Iop_Sub64x2, False );
+ goto decode_success;
+ }
+
+ /* 66 0F F9 = PSUBW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "psubw", Iop_Sub16x8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F E8 = PSUBSB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "psubsb", Iop_QSub8Sx16, False );
+ goto decode_success;
+ }
+
+ /* 66 0F E9 = PSUBSW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "psubsw", Iop_QSub16Sx8, False );
+ goto decode_success;
+ }
+
+   /* 66 0F D8 = PSUBUSB */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "psubusb", Iop_QSub8Ux16, False );
+ goto decode_success;
+ }
+
+   /* 66 0F D9 = PSUBUSW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "psubusw", Iop_QSub16Ux8, False );
+ goto decode_success;
+ }
+
+ /* 66 0F 68 = PUNPCKHBW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "punpckhbw",
+ Iop_InterleaveHI8x16, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 6A = PUNPCKHDQ */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "punpckhdq",
+ Iop_InterleaveHI32x4, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 6D = PUNPCKHQDQ */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "punpckhqdq",
+ Iop_InterleaveHI64x2, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 69 = PUNPCKHWD */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "punpckhwd",
+ Iop_InterleaveHI16x8, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 60 = PUNPCKLBW */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "punpcklbw",
+ Iop_InterleaveLO8x16, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 62 = PUNPCKLDQ */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "punpckldq",
+ Iop_InterleaveLO32x4, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 6C = PUNPCKLQDQ */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "punpcklqdq",
+ Iop_InterleaveLO64x2, True );
+ goto decode_success;
+ }
+
+ /* 66 0F 61 = PUNPCKLWD */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
+ delta = dis_SSEint_E_to_G( sorb, delta+2,
+ "punpcklwd",
+ Iop_InterleaveLO16x8, True );
+ goto decode_success;
+ }
+
+ /* 66 0F EF = PXOR */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
+ delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
+ goto decode_success;
+ }
+
+//-- /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
+//-- if (insn[0] == 0x0F && insn[1] == 0xAE
+//-- && (!epartIsReg(insn[2]))
+//-- && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
+//-- Bool store = gregOfRM(insn[2]) == 0;
+//-- vg_assert(sz == 4);
+//-- pair = disAMode ( cb, sorb, eip+2, dis_buf );
+//-- t1 = LOW24(pair);
+//-- eip += 2+HI8(pair);
+//-- uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
+//-- Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
+//-- Lit16, (UShort)insn[2],
+//-- TempReg, t1 );
+//-- DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
+//-- goto decode_success;
+//-- }
+
+ /* 0F AE /7 = CLFLUSH -- flush cache line */
+ if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
+ && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
+
+ /* This is something of a hack. We need to know the size of the
+ cache line containing addr. Since we don't (easily), assume
+ 256 on the basis that no real cache would have a line that
+ big. It's safe to invalidate more stuff than we need, just
+ inefficient. */
+ UInt lineszB = 256;
+
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ delta += 2+alen;
+
+ /* Round addr down to the start of the containing block. */
+ stmt( IRStmt_Put(
+ OFFB_TISTART,
+ binop( Iop_And32,
+ mkexpr(addr),
+ mkU32( ~(lineszB-1) ))) );
+
+ stmt( IRStmt_Put(OFFB_TILEN, mkU32(lineszB) ) );
+
+ irsb->jumpkind = Ijk_TInval;
+ irsb->next = mkU32(guest_EIP_bbstart+delta);
+ dres.whatNext = Dis_StopHere;
+
+ DIP("clflush %s\n", dis_buf);
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSE2 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* ---------------------------------------------------- */
+ /* --- start of the SSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* Skip parts of the decoder which don't apply given the stated
+ guest subarchitecture. */
+ /* if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3)) */
+   /* In fact this is highly bogus; we accept SSE3 insns even on an
+      SSE2-only guest since they turn into IR which can be re-emitted
+      successfully on an SSE2 host. */
+ if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
+ goto after_sse_decoders; /* no SSE3 capabilities */
+
+ insn = (UChar*)&guest_code[delta];
+
+ /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
+ duplicating some lanes (2:2:0:0). */
+ /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
+ duplicating some lanes (3:3:1:1). */
+ if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
+ && (insn[2] == 0x12 || insn[2] == 0x16)) {
+ IRTemp s3, s2, s1, s0;
+ IRTemp sV = newTemp(Ity_V128);
+ Bool isH = insn[2] == 0x16;
+ s3 = s2 = s1 = s0 = IRTemp_INVALID;
+
+ modrm = insn[3];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg( eregOfRM(modrm)) );
+ DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
+ nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+alen;
+ }
+
+ breakup128to32s( sV, &s3, &s2, &s1, &s0 );
+ putXMMReg( gregOfRM(modrm),
+ isH ? mk128from32s( s3, s3, s1, s1 )
+ : mk128from32s( s2, s2, s0, s0 ) );
+ goto decode_success;
+ }
+
+ /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
+ duplicating some lanes (0:1:0:1). */
+ if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp d0 = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg( eregOfRM(modrm)) );
+ DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+1;
+ assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
+ DIP("movddup %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+alen;
+ }
+
+ putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
+ goto decode_success;
+ }
+
+ /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
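+   /* Lane-wise (hi..lo): res = [g3+e3, g2-e2, g1+e1, g0-e0], i.e.
+      odd lanes add, even lanes subtract. */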
+ if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
+ IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
+ IRTemp eV = newTemp(Ity_V128);
+ IRTemp gV = newTemp(Ity_V128);
+ IRTemp addV = newTemp(Ity_V128);
+ IRTemp subV = newTemp(Ity_V128);
+ a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
+
+ modrm = insn[3];
+ if (epartIsReg(modrm)) {
+ assign( eV, getXMMReg( eregOfRM(modrm)) );
+ DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("addsubps %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+alen;
+ }
+
+ assign( gV, getXMMReg(gregOfRM(modrm)) );
+
+ assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
+ assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
+
+ breakup128to32s( addV, &a3, &a2, &a1, &a0 );
+ breakup128to32s( subV, &s3, &s2, &s1, &s0 );
+
+ putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
+ goto decode_success;
+ }
+
+   /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
+ if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
+ IRTemp eV = newTemp(Ity_V128);
+ IRTemp gV = newTemp(Ity_V128);
+ IRTemp addV = newTemp(Ity_V128);
+ IRTemp subV = newTemp(Ity_V128);
+ IRTemp a1 = newTemp(Ity_I64);
+ IRTemp s0 = newTemp(Ity_I64);
+
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( eV, getXMMReg( eregOfRM(modrm)) );
+ DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("addsubpd %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 2+alen;
+ }
+
+ assign( gV, getXMMReg(gregOfRM(modrm)) );
+
+ assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
+ assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
+
+ assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
+ assign( s0, unop(Iop_V128to64, mkexpr(subV) ));
+
+ putXMMReg( gregOfRM(modrm),
+ binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
+ goto decode_success;
+ }
+
+ /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
+ /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
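+   /* E.g. for hadd, the result (hi..lo) is
+         [e3+e2, e1+e0, g3+g2, g1+g0]
+      realised below as Add32Fx4(leftV, rightV) with
+         leftV = [e2, e0, g2, g0] and rightV = [e3, e1, g3, g1]. */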
+ if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
+ && (insn[2] == 0x7C || insn[2] == 0x7D)) {
+ IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
+ IRTemp eV = newTemp(Ity_V128);
+ IRTemp gV = newTemp(Ity_V128);
+ IRTemp leftV = newTemp(Ity_V128);
+ IRTemp rightV = newTemp(Ity_V128);
+ Bool isAdd = insn[2] == 0x7C;
+ HChar* str = isAdd ? "add" : "sub";
+ e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
+
+ modrm = insn[3];
+ if (epartIsReg(modrm)) {
+ assign( eV, getXMMReg( eregOfRM(modrm)) );
+ DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("h%sps %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+alen;
+ }
+
+ assign( gV, getXMMReg(gregOfRM(modrm)) );
+
+ breakup128to32s( eV, &e3, &e2, &e1, &e0 );
+ breakup128to32s( gV, &g3, &g2, &g1, &g0 );
+
+ assign( leftV, mk128from32s( e2, e0, g2, g0 ) );
+ assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
+
+ putXMMReg( gregOfRM(modrm),
+ binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
+ mkexpr(leftV), mkexpr(rightV) ) );
+ goto decode_success;
+ }
+
+ /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
+ /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
+ if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
+ IRTemp e1 = newTemp(Ity_I64);
+ IRTemp e0 = newTemp(Ity_I64);
+ IRTemp g1 = newTemp(Ity_I64);
+ IRTemp g0 = newTemp(Ity_I64);
+ IRTemp eV = newTemp(Ity_V128);
+ IRTemp gV = newTemp(Ity_V128);
+ IRTemp leftV = newTemp(Ity_V128);
+ IRTemp rightV = newTemp(Ity_V128);
+ Bool isAdd = insn[1] == 0x7C;
+ HChar* str = isAdd ? "add" : "sub";
+
+ modrm = insn[2];
+ if (epartIsReg(modrm)) {
+ assign( eV, getXMMReg( eregOfRM(modrm)) );
+ DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 2+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+2, dis_buf );
+ assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("h%spd %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 2+alen;
+ }
+
+ assign( gV, getXMMReg(gregOfRM(modrm)) );
+
+ assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
+ assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
+ assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
+ assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
+
+ assign( leftV, binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
+ assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
+
+ putXMMReg( gregOfRM(modrm),
+ binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
+ mkexpr(leftV), mkexpr(rightV) ) );
+ goto decode_success;
+ }
+
+ /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
+ if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
+ modrm = getIByte(delta+3);
+ if (epartIsReg(modrm)) {
+ goto decode_failure;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ putXMMReg( gregOfRM(modrm),
+ loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("lddqu %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+alen;
+ }
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* ---------------------------------------------------- */
+ /* --- start of the SSSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
+ Unsigned Bytes (MMX) */
+ if (sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ IRTemp sVoddsSX = newTemp(Ity_I64);
+ IRTemp sVevensSX = newTemp(Ity_I64);
+ IRTemp dVoddsZX = newTemp(Ity_I64);
+ IRTemp dVevensZX = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmaddubsw %s,%s\n", dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ /* compute dV unsigned x sV signed */
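+      /* Per-lane sketch: for each 16-bit result lane i,
+            res[i] = SatS16( (UChar)d8[2i]   * (Char)s8[2i]
+                           + (UChar)d8[2i+1] * (Char)s8[2i+1] )
+         realised by sign-/zero-extending the odd and even bytes into
+         16-bit lanes and combining with a signed saturating add. */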
+ assign( sVoddsSX,
+ binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
+ assign( sVevensSX,
+ binop(Iop_SarN16x4,
+ binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
+ mkU8(8)) );
+ assign( dVoddsZX,
+ binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
+ assign( dVevensZX,
+ binop(Iop_ShrN16x4,
+ binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
+ mkU8(8)) );
+
+ putMMXReg(
+ gregOfRM(modrm),
+ binop(Iop_QAdd16Sx4,
+ binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
+ binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
+ Unsigned Bytes (XMM) */
+ if (sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sVoddsSX = newTemp(Ity_V128);
+ IRTemp sVevensSX = newTemp(Ity_V128);
+ IRTemp dVoddsZX = newTemp(Ity_V128);
+ IRTemp dVevensZX = newTemp(Ity_V128);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmaddubsw %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ /* compute dV unsigned x sV signed */
+ assign( sVoddsSX,
+ binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
+ assign( sVevensSX,
+ binop(Iop_SarN16x8,
+ binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
+ mkU8(8)) );
+ assign( dVoddsZX,
+ binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
+ assign( dVevensZX,
+ binop(Iop_ShrN16x8,
+ binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
+ mkU8(8)) );
+
+ putXMMReg(
+ gregOfRM(modrm),
+ binop(Iop_QAdd16Sx8,
+ binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
+ binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
+ /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
+ mmx) and G to G (mmx). */
+ /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
+ mmx) and G to G (mmx). */
+ /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
+ to G (mmx). */
+ /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
+ to G (mmx). */
+ /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
+ to G (mmx). */
+ /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
+ to G (mmx). */
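+   /* E.g. PHADDW with d = [d3,d2,d1,d0] and s = [s3,s2,s1,s0] gives
+         res = [s3+s2, s1+s0, d3+d2, d1+d0] (hi..lo),
+      realised below as opV64( CatEven(s,d), CatOdd(s,d) ). */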
+
+ if (sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
+ || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
+ HChar* str = "???";
+ IROp opV64 = Iop_INVALID;
+ IROp opCatO = Iop_CatOddLanes16x4;
+ IROp opCatE = Iop_CatEvenLanes16x4;
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+
+ modrm = insn[3];
+
+ switch (insn[2]) {
+ case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
+ case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
+ case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
+ case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
+ case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
+ case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
+ default: vassert(0);
+ }
+ if (insn[2] == 0x02 || insn[2] == 0x06) {
+ opCatO = Iop_InterleaveHI32x2;
+ opCatE = Iop_InterleaveLO32x2;
+ }
+
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("ph%s %s,%s\n", str, dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ putMMXReg(
+ gregOfRM(modrm),
+ binop(opV64,
+ binop(opCatE,mkexpr(sV),mkexpr(dV)),
+ binop(opCatO,mkexpr(sV),mkexpr(dV))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
+ xmm) and G to G (xmm). */
+ /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
+ xmm) and G to G (xmm). */
+ /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
+ G to G (xmm). */
+ /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
+ G to G (xmm). */
+ /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
+ G to G (xmm). */
+ /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
+ G to G (xmm). */
+
+ if (sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
+ || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
+ HChar* str = "???";
+ IROp opV64 = Iop_INVALID;
+ IROp opCatO = Iop_CatOddLanes16x4;
+ IROp opCatE = Iop_CatEvenLanes16x4;
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+
+ switch (insn[2]) {
+ case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
+ case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
+ case 0x01: opV64 = Iop_Add16x4; str = "addw"; break;
+ case 0x05: opV64 = Iop_Sub16x4; str = "subw"; break;
+ case 0x02: opV64 = Iop_Add32x2; str = "addd"; break;
+ case 0x06: opV64 = Iop_Sub32x2; str = "subd"; break;
+ default: vassert(0);
+ }
+ if (insn[2] == 0x02 || insn[2] == 0x06) {
+ opCatO = Iop_InterleaveHI32x2;
+ opCatE = Iop_InterleaveLO32x2;
+ }
+
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg( eregOfRM(modrm)) );
+ DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+1;
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ DIP("ph%s %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ delta += 3+alen;
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+      /* This isn't a particularly efficient way to compute the
+         result, but at least it avoids a proliferation of IROps,
+         hence avoids complicating all the backends. */
+ putXMMReg(
+ gregOfRM(modrm),
+ binop(Iop_64HLtoV128,
+ binop(opV64,
+ binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
+ binop(opCatO,mkexpr(sHi),mkexpr(sLo))
+ ),
+ binop(opV64,
+ binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
+ binop(opCatO,mkexpr(dHi),mkexpr(dLo))
+ )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
+ (MMX) */
+ if (sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmulhrsw %s,%s\n", dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ putMMXReg(
+ gregOfRM(modrm),
+ dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
+ Scale (XMM) */
+ if (sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pmulhrsw %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ putXMMReg(
+ gregOfRM(modrm),
+ binop(Iop_64HLtoV128,
+ dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
+ dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 08 = PSIGNB -- Packed Sign 8x8 (MMX) */
+ /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
+   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
+ if (sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x08: laneszB = 1; str = "b"; break;
+ case 0x09: laneszB = 2; str = "w"; break;
+ case 0x0A: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("psign%s %s,%s\n", str, dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ putMMXReg(
+ gregOfRM(modrm),
+ dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
+ /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
+   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
+ if (sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x08: laneszB = 1; str = "b"; break;
+ case 0x09: laneszB = 2; str = "w"; break;
+ case 0x0A: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("psign%s %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ putXMMReg(
+ gregOfRM(modrm),
+ binop(Iop_64HLtoV128,
+ dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
+ dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8 (MMX) */
+ /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
+ /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
+ if (sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+ IRTemp sV = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x1C: laneszB = 1; str = "b"; break;
+ case 0x1D: laneszB = 2; str = "w"; break;
+ case 0x1E: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+ do_MMX_preamble();
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pabs%s %s,%s\n", str, dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ putMMXReg(
+ gregOfRM(modrm),
+ dis_PABS_helper( mkexpr(sV), laneszB )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
+ /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
+ /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
+ if (sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38
+ && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ HChar* str = "???";
+ Int laneszB = 0;
+
+ switch (insn[2]) {
+ case 0x1C: laneszB = 1; str = "b"; break;
+ case 0x1D: laneszB = 2; str = "w"; break;
+ case 0x1E: laneszB = 4; str = "d"; break;
+ default: vassert(0);
+ }
+
+ modrm = insn[3];
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pabs%s %s,%s\n", str, dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ putXMMReg(
+ gregOfRM(modrm),
+ binop(Iop_64HLtoV128,
+ dis_PABS_helper( mkexpr(sHi), laneszB ),
+ dis_PABS_helper( mkexpr(sLo), laneszB )
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
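+   /* Semantics (sketch): form the 128-bit value dV:sV and take the
+      8 bytes starting at byte offset imm, zero-filling once the
+      offset runs past the top:
+         res = 64 low bits of ((dV:sV) >> (8*imm)). */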
+ if (sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+ IRTemp res = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregOfRM(modrm)) );
+ d32 = (UInt)insn[3+1];
+ delta += 3+1+1;
+ DIP("palignr $%d,%s,%s\n", (Int)d32,
+ nameMMXReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ d32 = (UInt)insn[3+alen];
+ delta += 3+alen+1;
+ DIP("palignr $%d%s,%s\n", (Int)d32,
+ dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ if (d32 == 0) {
+ assign( res, mkexpr(sV) );
+ }
+ else if (d32 >= 1 && d32 <= 7) {
+ assign(res,
+ binop(Iop_Or64,
+ binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
+ binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
+ )));
+ }
+ else if (d32 == 8) {
+ assign( res, mkexpr(dV) );
+ }
+ else if (d32 >= 9 && d32 <= 15) {
+ assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
+ }
+ else if (d32 >= 16 && d32 <= 255) {
+ assign( res, mkU64(0) );
+ }
+ else
+ vassert(0);
+
+ putMMXReg( gregOfRM(modrm), mkexpr(res) );
+ goto decode_success;
+ }
+
+ /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
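+   /* Same idea as the MMX case above, but on the 256-bit value dV:sV
+      handled as four 64-bit pieces; hence the longer case split. */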
+ if (sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ IRTemp rHi = newTemp(Ity_I64);
+ IRTemp rLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ d32 = (UInt)insn[3+1];
+ delta += 3+1+1;
+ DIP("palignr $%d,%s,%s\n", (Int)d32,
+ nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ d32 = (UInt)insn[3+alen];
+ delta += 3+alen+1;
+ DIP("palignr $%d,%s,%s\n", (Int)d32,
+ dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ if (d32 == 0) {
+ assign( rHi, mkexpr(sHi) );
+ assign( rLo, mkexpr(sLo) );
+ }
+ else if (d32 >= 1 && d32 <= 7) {
+ assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
+ assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
+ }
+ else if (d32 == 8) {
+ assign( rHi, mkexpr(dLo) );
+ assign( rLo, mkexpr(sHi) );
+ }
+ else if (d32 >= 9 && d32 <= 15) {
+ assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
+ assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
+ }
+ else if (d32 == 16) {
+ assign( rHi, mkexpr(dHi) );
+ assign( rLo, mkexpr(dLo) );
+ }
+ else if (d32 >= 17 && d32 <= 23) {
+ assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
+ assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
+ }
+ else if (d32 == 24) {
+ assign( rHi, mkU64(0) );
+ assign( rLo, mkexpr(dHi) );
+ }
+ else if (d32 >= 25 && d32 <= 31) {
+ assign( rHi, mkU64(0) );
+ assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
+ }
+ else if (d32 >= 32 && d32 <= 255) {
+ assign( rHi, mkU64(0) );
+ assign( rLo, mkU64(0) );
+ }
+ else
+ vassert(0);
+
+ putXMMReg(
+ gregOfRM(modrm),
+ binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
+ );
+ goto decode_success;
+ }
+
+ /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
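+   /* Per-byte semantics (sketch):
+         res8[i] = (s8[i] & 0x80) ? 0 : d8[ s8[i] & 7 ];
+      realised below with Perm8x8 plus a mask built from the index
+      sign bits. */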
+ if (sz == 4
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
+ IRTemp sV = newTemp(Ity_I64);
+ IRTemp dV = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ do_MMX_preamble();
+ assign( dV, getMMXReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getMMXReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
+ nameMMXReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pshufb %s,%s\n", dis_buf,
+ nameMMXReg(gregOfRM(modrm)));
+ }
+
+ putMMXReg(
+ gregOfRM(modrm),
+ binop(
+ Iop_And64,
+ /* permute the lanes */
+ binop(
+ Iop_Perm8x8,
+ mkexpr(dV),
+ binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
+ ),
+ /* mask off lanes which have (index & 0x80) == 0x80 */
+ unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
+ )
+ );
+ goto decode_success;
+ }
+
+ /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
+ if (sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
+ IRTemp sV = newTemp(Ity_V128);
+ IRTemp dV = newTemp(Ity_V128);
+ IRTemp sHi = newTemp(Ity_I64);
+ IRTemp sLo = newTemp(Ity_I64);
+ IRTemp dHi = newTemp(Ity_I64);
+ IRTemp dLo = newTemp(Ity_I64);
+ IRTemp rHi = newTemp(Ity_I64);
+ IRTemp rLo = newTemp(Ity_I64);
+ IRTemp sevens = newTemp(Ity_I64);
+ IRTemp mask0x80hi = newTemp(Ity_I64);
+ IRTemp mask0x80lo = newTemp(Ity_I64);
+ IRTemp maskBit3hi = newTemp(Ity_I64);
+ IRTemp maskBit3lo = newTemp(Ity_I64);
+ IRTemp sAnd7hi = newTemp(Ity_I64);
+ IRTemp sAnd7lo = newTemp(Ity_I64);
+ IRTemp permdHi = newTemp(Ity_I64);
+ IRTemp permdLo = newTemp(Ity_I64);
+
+ modrm = insn[3];
+ assign( dV, getXMMReg(gregOfRM(modrm)) );
+
+ if (epartIsReg(modrm)) {
+ assign( sV, getXMMReg(eregOfRM(modrm)) );
+ delta += 3+1;
+ DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
+ nameXMMReg(gregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta+3, dis_buf );
+ gen_SEGV_if_not_16_aligned( addr );
+ assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
+ delta += 3+alen;
+ DIP("pshufb %s,%s\n", dis_buf,
+ nameXMMReg(gregOfRM(modrm)));
+ }
+
+ assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
+ assign( dLo, unop(Iop_V128to64, mkexpr(dV)) );
+ assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
+ assign( sLo, unop(Iop_V128to64, mkexpr(sV)) );
+
+ assign( sevens, mkU64(0x0707070707070707ULL) );
+
+ /*
+ mask0x80hi = Not(SarN8x8(sHi,7))
+ maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
+ sAnd7hi = And(sHi,sevens)
+ permdHi = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
+ And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
+ rHi = And(permdHi,mask0x80hi)
+ */
+ assign(
+ mask0x80hi,
+ unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
+
+ assign(
+ maskBit3hi,
+ binop(Iop_SarN8x8,
+ binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
+ mkU8(7)));
+
+ assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
+
+ assign(
+ permdHi,
+ binop(
+ Iop_Or64,
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
+ mkexpr(maskBit3hi)),
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
+ unop(Iop_Not64,mkexpr(maskBit3hi))) ));
+
+ assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
+
+ /* And the same for the lower half of the result. What fun. */
+
+ assign(
+ mask0x80lo,
+ unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
+
+ assign(
+ maskBit3lo,
+ binop(Iop_SarN8x8,
+ binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
+ mkU8(7)));
+
+ assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
+
+ assign(
+ permdLo,
+ binop(
+ Iop_Or64,
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
+ mkexpr(maskBit3lo)),
+ binop(Iop_And64,
+ binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
+ unop(Iop_Not64,mkexpr(maskBit3lo))) ));
+
+ assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
+
+ putXMMReg(
+ gregOfRM(modrm),
+ binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
+ );
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSSE3 decoder. --- */
+ /* ---------------------------------------------------- */
+
+ /* ---------------------------------------------------- */
+ /* --- start of the SSE4 decoder --- */
+ /* ---------------------------------------------------- */
+
+ /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
+ (Partial implementation only -- only deal with cases where
+ the rounding mode is specified directly by the immediate byte.)
+ 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
+ (Limitations ditto)
+ */
+ if (sz == 2
+ && insn[0] == 0x0F && insn[1] == 0x3A
+ && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
+
+ Bool isD = insn[2] == 0x0B;
+ IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
+ IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
+ Int imm = 0;
+
+ modrm = insn[3];
+
+ if (epartIsReg(modrm)) {
+ assign( src,
+ isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
+ : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
+ imm = insn[3+1];
+ if (imm & ~3) goto decode_failure;
+ delta += 3+1+1;
+ DIP( "rounds%c $%d,%s,%s\n",
+ isD ? 'd' : 's',
+ imm, nameXMMReg( eregOfRM(modrm) ),
+ nameXMMReg( gregOfRM(modrm) ) );
+ } else {
+ addr = disAMode( &alen, sorb, delta+3, dis_buf );
+ assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
+ imm = insn[3+alen];
+ if (imm & ~3) goto decode_failure;
+ delta += 3+alen+1;
+ DIP( "roundsd $%d,%s,%s\n",
+ imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
+ }
+
+ /* (imm & 3) contains an Intel-encoded rounding mode. Because
+ that encoding is the same as the encoding for IRRoundingMode,
+ we can use that value directly in the IR as a rounding
+ mode. */
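+      /* That is: 0 -> nearest, 1 -> -infinity, 2 -> +infinity,
+         3 -> towards zero, in both encodings. */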
+ assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
+ mkU32(imm & 3), mkexpr(src)) );
+
+ if (isD)
+ putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
+ else
+ putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
+
+ goto decode_success;
+ }
+
+   /* F3 0F BD = LZCNT -- count leading zeroes.  An AMD extension,
+      which we can only decode if we're sure this is an AMD cpu that
+      supports LZCNT, since otherwise it's BSR, which behaves
+      differently. */
+ if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
+ && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
+ vassert(sz == 2 || sz == 4);
+ /*IRType*/ ty = szToITy(sz);
+ IRTemp src = newTemp(ty);
+ modrm = insn[3];
+ if (epartIsReg(modrm)) {
+ assign(src, getIReg(sz, eregOfRM(modrm)));
+ delta += 3+1;
+ DIP("lzcnt%c %s, %s\n", nameISize(sz),
+ nameIReg(sz, eregOfRM(modrm)),
+ nameIReg(sz, gregOfRM(modrm)));
+ } else {
+ addr = disAMode( &alen, sorb, delta+3, dis_buf );
+ assign(src, loadLE(ty, mkexpr(addr)));
+ delta += 3+alen;
+ DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
+ nameIReg(sz, gregOfRM(modrm)));
+ }
+
+ IRTemp res = gen_LZCNT(ty, src);
+ putIReg(sz, gregOfRM(modrm), mkexpr(res));
+
+ // Update flags. This is pretty lame .. perhaps can do better
+ // if this turns out to be performance critical.
+ // O S A P are cleared. Z is set if RESULT == 0.
+ // C is set if SRC is zero.
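+      // E.g. for 16-bit operands: lzcnt(0x0010) = 11, and
+      // lzcnt(0) = 16 with C set and Z clear.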
+ IRTemp src32 = newTemp(Ity_I32);
+ IRTemp res32 = newTemp(Ity_I32);
+ assign(src32, widenUto32(mkexpr(src)));
+ assign(res32, widenUto32(mkexpr(res)));
+
+ IRTemp oszacp = newTemp(Ity_I32);
+ assign(
+ oszacp,
+ binop(Iop_Or32,
+ binop(Iop_Shl32,
+ unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
+ mkU8(X86G_CC_SHIFT_Z)),
+ binop(Iop_Shl32,
+ unop(Iop_1Uto32,
+ binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
+ mkU8(X86G_CC_SHIFT_C))
+ )
+ );
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
+
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- end of the SSE4 decoder --- */
+ /* ---------------------------------------------------- */
+
+ after_sse_decoders:
+
+ /* ---------------------------------------------------- */
+ /* --- deal with misc 0x67 pfxs (addr size override) -- */
+ /* ---------------------------------------------------- */
+
+ /* 67 E3 = JCXZ (for JECXZ see below) */
+ if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
+ delta += 2;
+ d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
+ delta ++;
+ stmt( IRStmt_Exit(
+ binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
+ Ijk_Boring,
+ IRConst_U32(d32)
+ ));
+ DIP("jcxz 0x%x\n", d32);
+ goto decode_success;
+ }
+
+ /* ---------------------------------------------------- */
+ /* --- start of the baseline insn decoder -- */
+ /* ---------------------------------------------------- */
+
+ /* Get the primary opcode. */
+ opc = getIByte(delta); delta++;
+
+ /* We get here if the current insn isn't SSE, or this CPU doesn't
+ support SSE. */
+
+ switch (opc) {
+
+ /* ------------------------ Control flow --------------- */
+
+ case 0xC2: /* RET imm16 */
+ d32 = getUDisp16(delta);
+ delta += 2;
+ dis_ret(d32);
+ dres.whatNext = Dis_StopHere;
+ DIP("ret %d\n", (Int)d32);
+ break;
+ case 0xC3: /* RET */
+ dis_ret(0);
+ dres.whatNext = Dis_StopHere;
+ DIP("ret\n");
+ break;
+
+ case 0xCF: /* IRET */
+ /* Note, this is an extremely kludgey and limited implementation
+ of iret. All it really does is:
+ popl %EIP; popl %CS; popl %EFLAGS.
+         %CS is set but ignored (as it is in (eg) popw %cs). */
+ t1 = newTemp(Ity_I32); /* ESP */
+ t2 = newTemp(Ity_I32); /* new EIP */
+ t3 = newTemp(Ity_I32); /* new CS */
+ t4 = newTemp(Ity_I32); /* new EFLAGS */
+ assign(t1, getIReg(4,R_ESP));
+ assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
+ assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
+ assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
+ /* Get stuff off stack */
+ putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
+ /* set %CS (which is ignored anyway) */
+ putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
+ /* set %EFLAGS */
+ set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
+ /* goto new EIP value */
+ jmp_treg(Ijk_Ret,t2);
+ dres.whatNext = Dis_StopHere;
+ DIP("iret (very kludgey)\n");
+ break;
+
+ case 0xE8: /* CALL J4 */
+ d32 = getUDisp32(delta); delta += 4;
+ d32 += (guest_EIP_bbstart+delta);
+ /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
+ if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
+ && getIByte(delta) <= 0x5F) {
+ /* Specially treat the position-independent-code idiom
+ call X
+ X: popl %reg
+ as
+ movl %eip, %reg.
+ since this generates better code, but for no other reason. */
+ Int archReg = getIByte(delta) - 0x58;
+ /* vex_printf("-- fPIC thingy\n"); */
+ putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
+ delta++; /* Step over the POP */
+ DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
+ } else {
+ /* The normal sequence for a call. */
+ t1 = newTemp(Ity_I32);
+ assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
+ putIReg(4, R_ESP, mkexpr(t1));
+ storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
+ if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32 )) {
+ /* follow into the call target. */
+ dres.whatNext = Dis_ResteerU;
+ dres.continueAt = (Addr64)(Addr32)d32;
+ } else {
+ jmp_lit(Ijk_Call,d32);
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("call 0x%x\n",d32);
+ }
+ break;
+
+//-- case 0xC8: /* ENTER */
+//-- d32 = getUDisp16(eip); eip += 2;
+//-- abyte = getIByte(delta); delta++;
+//--
+//-- vg_assert(sz == 4);
+//-- vg_assert(abyte == 0);
+//--
+//-- t1 = newTemp(cb); t2 = newTemp(cb);
+//-- uInstr2(cb, GET, sz, ArchReg, R_EBP, TempReg, t1);
+//-- uInstr2(cb, GET, 4, ArchReg, R_ESP, TempReg, t2);
+//-- uInstr2(cb, SUB, 4, Literal, 0, TempReg, t2);
+//-- uLiteral(cb, sz);
+//-- uInstr2(cb, PUT, 4, TempReg, t2, ArchReg, R_ESP);
+//-- uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
+//-- uInstr2(cb, PUT, 4, TempReg, t2, ArchReg, R_EBP);
+//-- if (d32) {
+//-- uInstr2(cb, SUB, 4, Literal, 0, TempReg, t2);
+//-- uLiteral(cb, d32);
+//-- uInstr2(cb, PUT, 4, TempReg, t2, ArchReg, R_ESP);
+//-- }
+//-- DIP("enter 0x%x, 0x%x", d32, abyte);
+//-- break;
+
+ case 0xC9: /* LEAVE */
+ vassert(sz == 4);
+ t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
+ assign(t1, getIReg(4,R_EBP));
+         /* The first PUT to ESP looks redundant, but we need it
+            because ESP must always be up-to-date for Memcheck to work... */
+ putIReg(4, R_ESP, mkexpr(t1));
+ assign(t2, loadLE(Ity_I32,mkexpr(t1)));
+ putIReg(4, R_EBP, mkexpr(t2));
+ putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
+ DIP("leave\n");
+ break;
+
+ /* ---------------- Misc weird-ass insns --------------- */
+
+ case 0x27: /* DAA */
+ case 0x2F: /* DAS */
+ case 0x37: /* AAA */
+ case 0x3F: /* AAS */
+ /* An ugly implementation for some ugly instructions. Oh
+ well. */
+ if (sz != 4) goto decode_failure;
+ t1 = newTemp(Ity_I32);
+ t2 = newTemp(Ity_I32);
+ /* Make up a 32-bit value (t1), with the old value of AX in the
+ bottom 16 bits, and the old OSZACP bitmask in the upper 16
+ bits. */
+ assign(t1,
+ binop(Iop_16HLto32,
+ unop(Iop_32to16,
+ mk_x86g_calculate_eflags_all()),
+ getIReg(2, R_EAX)
+ ));
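+         /* e.g. if the OSZACP flags come out as 0x0011 and AX is
+            0x1234, then t1 = 0x00111234. */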
+ /* Call the helper fn, to get a new AX and OSZACP value, and
+ poke both back into the guest state. Also pass the helper
+ the actual opcode so it knows which of the 4 instructions it
+ is doing the computation for. */
+ vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
+ assign(t2,
+ mkIRExprCCall(
+ Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
+ &x86g_calculate_daa_das_aaa_aas,
+ mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
+ ));
+ putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1,
+ binop(Iop_And32,
+ binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
+ mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
+ | X86G_CC_MASK_A | X86G_CC_MASK_Z
+ | X86G_CC_MASK_S| X86G_CC_MASK_O )
+ )
+ )
+ );
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+ switch (opc) {
+ case 0x27: DIP("daa\n"); break;
+ case 0x2F: DIP("das\n"); break;
+ case 0x37: DIP("aaa\n"); break;
+ case 0x3F: DIP("aas\n"); break;
+ default: vassert(0);
+ }
+ break;
+
+//-- case 0xD4: /* AAM */
+//-- case 0xD5: /* AAD */
+//-- d32 = getIByte(delta); delta++;
+//-- if (d32 != 10) VG_(core_panic)("disInstr: AAM/AAD but base not 10 !");
+//-- t1 = newTemp(cb);
+//-- uInstr2(cb, GET, 2, ArchReg, R_EAX, TempReg, t1);
+//-- /* Widen %AX to 32 bits, so it's all defined when we push it. */
+//-- uInstr1(cb, WIDEN, 4, TempReg, t1);
+//-- uWiden(cb, 2, False);
+//-- uInstr0(cb, CALLM_S, 0);
+//-- uInstr1(cb, PUSH, 4, TempReg, t1);
+//-- uInstr1(cb, CALLM, 0, Lit16,
+//-- opc == 0xD4 ? VGOFF_(helper_AAM) : VGOFF_(helper_AAD) );
+//-- uFlagsRWU(cb, FlagsEmpty, FlagsSZP, FlagsEmpty);
+//-- uInstr1(cb, POP, 4, TempReg, t1);
+//-- uInstr0(cb, CALLM_E, 0);
+//-- uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
+//-- DIP(opc == 0xD4 ? "aam\n" : "aad\n");
+//-- break;
+
+ /* ------------------------ CWD/CDQ -------------------- */
+
+      case 0x98: /* CBW / CWDE */
+ if (sz == 4) {
+ putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
+ DIP("cwde\n");
+ } else {
+ vassert(sz == 2);
+ putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
+ DIP("cbw\n");
+ }
+ break;
+
+ case 0x99: /* CWD/CDQ */
+ ty = szToITy(sz);
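+         /* Sign-extend eAX into eDX by copying the sign bit across
+            the whole word: an arithmetic shift right by 15 (sz==2) or
+            31 (sz==4) yields all-zeroes or all-ones.  E.g. for
+            EAX = 0x80000000 this leaves EDX = 0xFFFFFFFF. */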
+ putIReg(sz, R_EDX,
+ binop(mkSizedOp(ty,Iop_Sar8),
+ getIReg(sz, R_EAX),
+ mkU8(sz == 2 ? 15 : 31)) );
+         DIP(sz == 2 ? "cwd\n" : "cdq\n");
+ break;
+
+ /* ------------------------ FPU ops -------------------- */
+
+ case 0x9E: /* SAHF */
+ codegen_SAHF();
+ DIP("sahf\n");
+ break;
+
+ case 0x9F: /* LAHF */
+ codegen_LAHF();
+ DIP("lahf\n");
+ break;
+
+ case 0x9B: /* FWAIT */
+         /* Effectively a no-op here: deferred FP exceptions are not
+            modelled, so there is nothing to wait for. */
+ DIP("fwait\n");
+ break;
+
+ case 0xD8:
+ case 0xD9:
+ case 0xDA:
+ case 0xDB:
+ case 0xDC:
+ case 0xDD:
+ case 0xDE:
+ case 0xDF: {
+ Int delta0 = delta;
+ Bool decode_OK = False;
+ delta = dis_FPU ( &decode_OK, sorb, delta );
+ if (!decode_OK) {
+ delta = delta0;
+ goto decode_failure;
+ }
+ break;
+ }
+
+ /* ------------------------ INC & DEC ------------------ */
+
+ case 0x40: /* INC eAX */
+ case 0x41: /* INC eCX */
+ case 0x42: /* INC eDX */
+ case 0x43: /* INC eBX */
+ case 0x44: /* INC eSP */
+ case 0x45: /* INC eBP */
+ case 0x46: /* INC eSI */
+ case 0x47: /* INC eDI */
+ vassert(sz == 2 || sz == 4);
+ ty = szToITy(sz);
+ t1 = newTemp(ty);
+ assign( t1, binop(mkSizedOp(ty,Iop_Add8),
+ getIReg(sz, (UInt)(opc - 0x40)),
+ mkU(ty,1)) );
+ setFlags_INC_DEC( True, t1, ty );
+ putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
+ DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
+ break;
+
+ case 0x48: /* DEC eAX */
+ case 0x49: /* DEC eCX */
+ case 0x4A: /* DEC eDX */
+ case 0x4B: /* DEC eBX */
+ case 0x4C: /* DEC eSP */
+ case 0x4D: /* DEC eBP */
+ case 0x4E: /* DEC eSI */
+ case 0x4F: /* DEC eDI */
+ vassert(sz == 2 || sz == 4);
+ ty = szToITy(sz);
+ t1 = newTemp(ty);
+ assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
+ getIReg(sz, (UInt)(opc - 0x48)),
+ mkU(ty,1)) );
+ setFlags_INC_DEC( False, t1, ty );
+ putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
+ DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
+ break;
+
+ /* ------------------------ INT ------------------------ */
+
+ case 0xCC: /* INT 3 */
+ jmp_lit(Ijk_SigTRAP,((Addr32)guest_EIP_bbstart)+delta);
+ dres.whatNext = Dis_StopHere;
+ DIP("int $0x3\n");
+ break;
+
+ case 0xCD: /* INT imm8 */
+ d32 = getIByte(delta); delta++;
+
+ /* For any of the cases where we emit a jump (that is, for all
+ currently handled cases), it's important that all ArchRegs
+ carry their up-to-date value at this point. So we declare an
+ end-of-block here, which forces any TempRegs caching ArchRegs
+ to be flushed. */
+
+         /* Handle int $0x40 .. $0x43 by synthesising a segfault and a
+            restart of this instruction (hence the "-2" two lines below,
+            which makes the restart EIP point at this instruction).
+            This is probably Linux-specific, and it would be more
+            correct to do this only if the VexAbiInfo says that is what
+            we should do. */
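+         /* (The "-2" works because the INT imm8 insn (CD ib) is
+            exactly two bytes long, and delta already points past it.) */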
+ if (d32 >= 0x40 && d32 <= 0x43) {
+ jmp_lit(Ijk_SigSEGV,((Addr32)guest_EIP_bbstart)+delta-2);
+ dres.whatNext = Dis_StopHere;
+ DIP("int $0x%x\n", (Int)d32);
+ break;
+ }
+
+ /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
+ (darwin syscalls). As part of this, note where we are, so we
+ can back up the guest to this point if the syscall needs to
+ be restarted. */
+ if (d32 == 0x80) {
+ stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
+ mkU32(guest_EIP_curr_instr) ) );
+ jmp_lit(Ijk_Sys_int128,((Addr32)guest_EIP_bbstart)+delta);
+ dres.whatNext = Dis_StopHere;
+ DIP("int $0x80\n");
+ break;
+ }
+ if (d32 == 0x81) {
+ stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
+ mkU32(guest_EIP_curr_instr) ) );
+ jmp_lit(Ijk_Sys_int129,((Addr32)guest_EIP_bbstart)+delta);
+ dres.whatNext = Dis_StopHere;
+ DIP("int $0x81\n");
+ break;
+ }
+ if (d32 == 0x82) {
+ stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
+ mkU32(guest_EIP_curr_instr) ) );
+ jmp_lit(Ijk_Sys_int130,((Addr32)guest_EIP_bbstart)+delta);
+ dres.whatNext = Dis_StopHere;
+ DIP("int $0x82\n");
+ break;
+ }
+
+ /* none of the above */
+ goto decode_failure;
+
+ /* ------------------------ Jcond, byte offset --------- */
+
+ case 0xEB: /* Jb (jump, byte offset) */
+ d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
+ delta++;
+ if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
+ dres.whatNext = Dis_ResteerU;
+ dres.continueAt = (Addr64)(Addr32)d32;
+ } else {
+ jmp_lit(Ijk_Boring,d32);
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("jmp-8 0x%x\n", d32);
+ break;
+
+ case 0xE9: /* Jv (jump, 16/32 offset) */
+ vassert(sz == 4); /* JRS added 2004 July 11 */
+ d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
+ delta += sz;
+ if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
+ dres.whatNext = Dis_ResteerU;
+ dres.continueAt = (Addr64)(Addr32)d32;
+ } else {
+ jmp_lit(Ijk_Boring,d32);
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("jmp 0x%x\n", d32);
+ break;
+
+      case 0x70: /* JOb (jump overflow) */
+      case 0x71: /* JNOb (jump not overflow) */
+      case 0x72: /* JBb/JNAEb (jump below) */
+      case 0x73: /* JNBb/JAEb (jump not below) */
+      case 0x74: /* JZb/JEb (jump zero) */
+      case 0x75: /* JNZb/JNEb (jump not zero) */
+      case 0x76: /* JBEb/JNAb (jump below or equal) */
+      case 0x77: /* JNBEb/JAb (jump not below or equal) */
+      case 0x78: /* JSb (jump negative) */
+      case 0x79: /* JNSb (jump not negative) */
+ case 0x7A: /* JP (jump parity even) */
+ case 0x7B: /* JNP/JPO (jump parity odd) */
+ case 0x7C: /* JLb/JNGEb (jump less) */
+ case 0x7D: /* JGEb/JNLb (jump greater or equal) */
+ case 0x7E: /* JLEb/JNGb (jump less or equal) */
+ case 0x7F: /* JGb/JNLEb (jump greater) */
+ { Int jmpDelta;
+ HChar* comment = "";
+ jmpDelta = (Int)getSDisp8(delta);
+ vassert(-128 <= jmpDelta && jmpDelta < 128);
+ d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
+ delta++;
+ if (resteerCisOk
+ && vex_control.guest_chase_cond
+ && (Addr32)d32 != (Addr32)guest_EIP_bbstart
+ && jmpDelta < 0
+ && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
+ /* Speculation: assume this backward branch is taken. So we
+ need to emit a side-exit to the insn following this one,
+ on the negation of the condition, and continue at the
+ branch target address (d32). If we wind up back at the
+ first instruction of the trace, just stop; it's better to
+ let the IR loop unroller handle that case. */
+ stmt( IRStmt_Exit(
+ mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
+ Ijk_Boring,
+ IRConst_U32(guest_EIP_bbstart+delta) ) );
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = (Addr64)(Addr32)d32;
+ comment = "(assumed taken)";
+ }
+ else
+ if (resteerCisOk
+ && vex_control.guest_chase_cond
+ && (Addr32)d32 != (Addr32)guest_EIP_bbstart
+ && jmpDelta >= 0
+ && resteerOkFn( callback_opaque,
+ (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
+ /* Speculation: assume this forward branch is not taken. So
+ we need to emit a side-exit to d32 (the dest) and continue
+ disassembling at the insn immediately following this
+ one. */
+ stmt( IRStmt_Exit(
+ mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
+ Ijk_Boring,
+ IRConst_U32(d32) ) );
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
+ comment = "(assumed not taken)";
+ }
+ else {
+ /* Conservative default translation - end the block at this
+ point. */
+ jcc_01( (X86Condcode)(opc - 0x70),
+ (Addr32)(guest_EIP_bbstart+delta), d32);
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
+ break;
+ }
+
+ case 0xE3: /* JECXZ (for JCXZ see above) */
+ if (sz != 4) goto decode_failure;
+ d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
+ delta ++;
+ stmt( IRStmt_Exit(
+ binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
+ Ijk_Boring,
+ IRConst_U32(d32)
+ ));
+ DIP("jecxz 0x%x\n", d32);
+ break;
+
+ case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
+ case 0xE1: /* LOOPE disp8: decrement count, jump if count != 0 && ZF==1 */
+ case 0xE2: /* LOOP disp8: decrement count, jump if count != 0 */
+ { /* Again, the docs say this uses ECX/CX as a count depending on
+ the address size override, not the operand one. Since we
+ don't handle address size overrides, I guess that means
+ ECX. */
+ IRExpr* zbit = NULL;
+ IRExpr* count = NULL;
+ IRExpr* cond = NULL;
+ HChar* xtra = NULL;
+
+ if (sz != 4) goto decode_failure;
+ d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
+ delta++;
+ putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
+
+ count = getIReg(4,R_ECX);
+ cond = binop(Iop_CmpNE32, count, mkU32(0));
+ switch (opc) {
+ case 0xE2:
+ xtra = "";
+ break;
+ case 0xE1:
+ xtra = "e";
+ zbit = mk_x86g_calculate_condition( X86CondZ );
+ cond = mkAnd1(cond, zbit);
+ break;
+ case 0xE0:
+ xtra = "ne";
+ zbit = mk_x86g_calculate_condition( X86CondNZ );
+ cond = mkAnd1(cond, zbit);
+ break;
+ default:
+ vassert(0);
+ }
+ stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32)) );
+
+ DIP("loop%s 0x%x\n", xtra, d32);
+ break;
+ }
+
+ /* ------------------------ IMUL ----------------------- */
+
+ case 0x69: /* IMUL Iv, Ev, Gv */
+ delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
+ break;
+ case 0x6B: /* IMUL Ib, Ev, Gv */
+ delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
+ break;
+
+ /* ------------------------ MOV ------------------------ */
+
+ case 0x88: /* MOV Gb,Eb */
+ delta = dis_mov_G_E(sorb, 1, delta);
+ break;
+
+ case 0x89: /* MOV Gv,Ev */
+ delta = dis_mov_G_E(sorb, sz, delta);
+ break;
+
+ case 0x8A: /* MOV Eb,Gb */
+ delta = dis_mov_E_G(sorb, 1, delta);
+ break;
+
+ case 0x8B: /* MOV Ev,Gv */
+ delta = dis_mov_E_G(sorb, sz, delta);
+ break;
+
+ case 0x8D: /* LEA M,Gv */
+ if (sz != 4)
+ goto decode_failure;
+ modrm = getIByte(delta);
+ if (epartIsReg(modrm))
+ goto decode_failure;
+ /* NOTE! this is the one place where a segment override prefix
+ has no effect on the address calculation. Therefore we pass
+ zero instead of sorb here. */
+ addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
+ delta += alen;
+ putIReg(sz, gregOfRM(modrm), mkexpr(addr));
+ DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
+ nameIReg(sz,gregOfRM(modrm)));
+ break;
+
+ case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
+ delta = dis_mov_Sw_Ew(sorb, sz, delta);
+ break;
+
+ case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
+ delta = dis_mov_Ew_Sw(sorb, delta);
+ break;
+
+ case 0xA0: /* MOV Ob,AL */
+ sz = 1;
+ /* Fall through ... */
+ case 0xA1: /* MOV Ov,eAX */
+ d32 = getUDisp32(delta); delta += 4;
+ ty = szToITy(sz);
+ addr = newTemp(Ity_I32);
+ assign( addr, handleSegOverride(sorb, mkU32(d32)) );
+ putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
+ DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
+ d32, nameIReg(sz,R_EAX));
+ break;
+
+      case 0xA2: /* MOV AL,Ob */
+ sz = 1;
+ /* Fall through ... */
+ case 0xA3: /* MOV eAX,Ov */
+ d32 = getUDisp32(delta); delta += 4;
+ ty = szToITy(sz);
+ addr = newTemp(Ity_I32);
+ assign( addr, handleSegOverride(sorb, mkU32(d32)) );
+ storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
+ DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
+ sorbTxt(sorb), d32);
+ break;
+
+ case 0xB0: /* MOV imm,AL */
+ case 0xB1: /* MOV imm,CL */
+ case 0xB2: /* MOV imm,DL */
+ case 0xB3: /* MOV imm,BL */
+ case 0xB4: /* MOV imm,AH */
+ case 0xB5: /* MOV imm,CH */
+ case 0xB6: /* MOV imm,DH */
+ case 0xB7: /* MOV imm,BH */
+ d32 = getIByte(delta); delta += 1;
+ putIReg(1, opc-0xB0, mkU8(d32));
+ DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
+ break;
+
+ case 0xB8: /* MOV imm,eAX */
+ case 0xB9: /* MOV imm,eCX */
+ case 0xBA: /* MOV imm,eDX */
+ case 0xBB: /* MOV imm,eBX */
+ case 0xBC: /* MOV imm,eSP */
+ case 0xBD: /* MOV imm,eBP */
+ case 0xBE: /* MOV imm,eSI */
+ case 0xBF: /* MOV imm,eDI */
+ d32 = getUDisp(sz,delta); delta += sz;
+ putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
+ DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
+ break;
+
+ case 0xC6: /* MOV Ib,Eb */
+ sz = 1;
+ goto do_Mov_I_E;
+ case 0xC7: /* MOV Iv,Ev */
+ goto do_Mov_I_E;
+
+ do_Mov_I_E:
+ modrm = getIByte(delta);
+ if (epartIsReg(modrm)) {
+ delta++; /* mod/rm byte */
+ d32 = getUDisp(sz,delta); delta += sz;
+ putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
+ DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
+ nameIReg(sz,eregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ delta += alen;
+ d32 = getUDisp(sz,delta); delta += sz;
+ storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
+ DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
+ }
+ break;
+
+ /* ------------------------ opl imm, A ----------------- */
+
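+      /* Note: the Iop_*8 opcodes passed in the following cases are
+         just size templates; the helper resizes them to match sz
+         (cf. the explicit mkSizedOp uses above).  The same holds for
+         the opl Ev,Gv and opl Gv,Ev cases further below. */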
+ case 0x04: /* ADD Ib, AL */
+ delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
+ break;
+ case 0x05: /* ADD Iv, eAX */
+ delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
+ break;
+
+ case 0x0C: /* OR Ib, AL */
+ delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
+ break;
+ case 0x0D: /* OR Iv, eAX */
+ delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
+ break;
+
+ case 0x14: /* ADC Ib, AL */
+ delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
+ break;
+ case 0x15: /* ADC Iv, eAX */
+ delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
+ break;
+
+ case 0x1C: /* SBB Ib, AL */
+ delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
+ break;
+ case 0x1D: /* SBB Iv, eAX */
+ delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
+ break;
+
+ case 0x24: /* AND Ib, AL */
+ delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
+ break;
+ case 0x25: /* AND Iv, eAX */
+ delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
+ break;
+
+ case 0x2C: /* SUB Ib, AL */
+ delta = dis_op_imm_A( 1, False, Iop_Sub8, True, delta, "sub" );
+ break;
+ case 0x2D: /* SUB Iv, eAX */
+ delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
+ break;
+
+ case 0x34: /* XOR Ib, AL */
+ delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
+ break;
+ case 0x35: /* XOR Iv, eAX */
+ delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
+ break;
+
+ case 0x3C: /* CMP Ib, AL */
+ delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
+ break;
+ case 0x3D: /* CMP Iv, eAX */
+ delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
+ break;
+
+ case 0xA8: /* TEST Ib, AL */
+ delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
+ break;
+ case 0xA9: /* TEST Iv, eAX */
+ delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
+ break;
+
+ /* ------------------------ opl Ev, Gv ----------------- */
+
+ case 0x02: /* ADD Eb,Gb */
+ delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
+ break;
+ case 0x03: /* ADD Ev,Gv */
+ delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
+ break;
+
+ case 0x0A: /* OR Eb,Gb */
+ delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
+ break;
+ case 0x0B: /* OR Ev,Gv */
+ delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
+ break;
+
+ case 0x12: /* ADC Eb,Gb */
+ delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
+ break;
+ case 0x13: /* ADC Ev,Gv */
+ delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
+ break;
+
+ case 0x1A: /* SBB Eb,Gb */
+ delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
+ break;
+ case 0x1B: /* SBB Ev,Gv */
+ delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
+ break;
+
+ case 0x22: /* AND Eb,Gb */
+ delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
+ break;
+ case 0x23: /* AND Ev,Gv */
+ delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
+ break;
+
+ case 0x2A: /* SUB Eb,Gb */
+ delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
+ break;
+ case 0x2B: /* SUB Ev,Gv */
+ delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
+ break;
+
+ case 0x32: /* XOR Eb,Gb */
+ delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
+ break;
+ case 0x33: /* XOR Ev,Gv */
+ delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
+ break;
+
+ case 0x3A: /* CMP Eb,Gb */
+ delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
+ break;
+ case 0x3B: /* CMP Ev,Gv */
+ delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
+ break;
+
+ case 0x84: /* TEST Eb,Gb */
+ delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
+ break;
+ case 0x85: /* TEST Ev,Gv */
+ delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
+ break;
+
+ /* ------------------------ opl Gv, Ev ----------------- */
+
+ case 0x00: /* ADD Gb,Eb */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Add8, True, 1, delta, "add" );
+ break;
+ case 0x01: /* ADD Gv,Ev */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Add8, True, sz, delta, "add" );
+ break;
+
+ case 0x08: /* OR Gb,Eb */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Or8, True, 1, delta, "or" );
+ break;
+ case 0x09: /* OR Gv,Ev */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Or8, True, sz, delta, "or" );
+ break;
+
+ case 0x10: /* ADC Gb,Eb */
+ delta = dis_op2_G_E ( sorb, pfx_lock, True,
+ Iop_Add8, True, 1, delta, "adc" );
+ break;
+ case 0x11: /* ADC Gv,Ev */
+ delta = dis_op2_G_E ( sorb, pfx_lock, True,
+ Iop_Add8, True, sz, delta, "adc" );
+ break;
+
+ case 0x18: /* SBB Gb,Eb */
+ delta = dis_op2_G_E ( sorb, pfx_lock, True,
+ Iop_Sub8, True, 1, delta, "sbb" );
+ break;
+ case 0x19: /* SBB Gv,Ev */
+ delta = dis_op2_G_E ( sorb, pfx_lock, True,
+ Iop_Sub8, True, sz, delta, "sbb" );
+ break;
+
+ case 0x20: /* AND Gb,Eb */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_And8, True, 1, delta, "and" );
+ break;
+ case 0x21: /* AND Gv,Ev */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_And8, True, sz, delta, "and" );
+ break;
+
+ case 0x28: /* SUB Gb,Eb */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Sub8, True, 1, delta, "sub" );
+ break;
+ case 0x29: /* SUB Gv,Ev */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Sub8, True, sz, delta, "sub" );
+ break;
+
+ case 0x30: /* XOR Gb,Eb */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Xor8, True, 1, delta, "xor" );
+ break;
+ case 0x31: /* XOR Gv,Ev */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Xor8, True, sz, delta, "xor" );
+ break;
+
+ case 0x38: /* CMP Gb,Eb */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Sub8, False, 1, delta, "cmp" );
+ break;
+ case 0x39: /* CMP Gv,Ev */
+ delta = dis_op2_G_E ( sorb, pfx_lock, False,
+ Iop_Sub8, False, sz, delta, "cmp" );
+ break;
+
+ /* ------------------------ POP ------------------------ */
+
+ case 0x58: /* POP eAX */
+ case 0x59: /* POP eCX */
+ case 0x5A: /* POP eDX */
+ case 0x5B: /* POP eBX */
+ case 0x5D: /* POP eBP */
+ case 0x5E: /* POP eSI */
+ case 0x5F: /* POP eDI */
+ case 0x5C: /* POP eSP */
+ vassert(sz == 2 || sz == 4);
+ t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
+ assign(t2, getIReg(4, R_ESP));
+ assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
+ putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
+ putIReg(sz, opc-0x58, mkexpr(t1));
+ DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
+ break;
+
+ case 0x9D: /* POPF */
+ vassert(sz == 2 || sz == 4);
+ t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
+ assign(t2, getIReg(4, R_ESP));
+ assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
+ putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
+
+ /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
+ value in t1. */
+ set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
+ ((Addr32)guest_EIP_bbstart)+delta );
+
+ DIP("popf%c\n", nameISize(sz));
+ break;
+
+ case 0x61: /* POPA */
+ /* This is almost certainly wrong for sz==2. So ... */
+ if (sz != 4) goto decode_failure;
+
+ /* t5 is the old %ESP value. */
+ t5 = newTemp(Ity_I32);
+ assign( t5, getIReg(4, R_ESP) );
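+         /* Stack layout, from t5 upwards, matching what PUSHA wrote:
+            EDI, ESI, EBP, the saved ESP (ignored), EBX, EDX, ECX, EAX. */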
+
+ /* Reload all the registers, except %esp. */
+ putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
+ putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
+ putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
+ putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
+ /* ignore saved %ESP */
+ putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
+ putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
+ putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
+
+ /* and move %ESP back up */
+ putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
+
+ DIP("popa%c\n", nameISize(sz));
+ break;
+
+ case 0x8F: /* POPL/POPW m32 */
+ { Int len;
+ UChar rm = getIByte(delta);
+
+        /* make sure this really is a POP insn (reg field must be 0) */
+ if (epartIsReg(rm) || gregOfRM(rm) != 0)
+ goto decode_failure;
+ /* and has correct size */
+ if (sz != 4 && sz != 2)
+ goto decode_failure;
+ ty = szToITy(sz);
+
+ t1 = newTemp(Ity_I32); /* stack address */
+ t3 = newTemp(ty); /* data */
+ /* set t1 to ESP: t1 = ESP */
+ assign( t1, getIReg(4, R_ESP) );
+ /* load M[ESP] to virtual register t3: t3 = M[t1] */
+ assign( t3, loadLE(ty, mkexpr(t1)) );
+
+ /* increase ESP; must be done before the STORE. Intel manual says:
+ If the ESP register is used as a base register for addressing
+ a destination operand in memory, the POP instruction computes
+ the effective address of the operand after it increments the
+ ESP register.
+ */
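+        /* e.g. for "popl 4(%esp)", the store address is computed from
+           the already-incremented ESP. */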
+ putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
+
+ /* resolve MODR/M */
+ addr = disAMode ( &len, sorb, delta, dis_buf);
+ storeLE( mkexpr(addr), mkexpr(t3) );
+
+ DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
+
+ delta += len;
+ break;
+ }
+
+ case 0x1F: /* POP %DS */
+ dis_pop_segreg( R_DS, sz ); break;
+ case 0x07: /* POP %ES */
+ dis_pop_segreg( R_ES, sz ); break;
+ case 0x17: /* POP %SS */
+ dis_pop_segreg( R_SS, sz ); break;
+
+ /* ------------------------ PUSH ----------------------- */
+
+ case 0x50: /* PUSH eAX */
+ case 0x51: /* PUSH eCX */
+ case 0x52: /* PUSH eDX */
+ case 0x53: /* PUSH eBX */
+ case 0x55: /* PUSH eBP */
+ case 0x56: /* PUSH eSI */
+ case 0x57: /* PUSH eDI */
+ case 0x54: /* PUSH eSP */
+ /* This is the Right Way, in that the value to be pushed is
+ established before %esp is changed, so that pushl %esp
+ correctly pushes the old value. */
+ vassert(sz == 2 || sz == 4);
+ ty = sz==2 ? Ity_I16 : Ity_I32;
+ t1 = newTemp(ty); t2 = newTemp(Ity_I32);
+ assign(t1, getIReg(sz, opc-0x50));
+ assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
+ putIReg(4, R_ESP, mkexpr(t2) );
+ storeLE(mkexpr(t2),mkexpr(t1));
+ DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
+ break;
+
+
+ case 0x68: /* PUSH Iv */
+ d32 = getUDisp(sz,delta); delta += sz;
+ goto do_push_I;
+ case 0x6A: /* PUSH Ib, sign-extended to sz */
+ d32 = getSDisp8(delta); delta += 1;
+ goto do_push_I;
+ do_push_I:
+ ty = szToITy(sz);
+ t1 = newTemp(Ity_I32); t2 = newTemp(ty);
+ assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
+ putIReg(4, R_ESP, mkexpr(t1) );
+ /* stop mkU16 asserting if d32 is a negative 16-bit number
+ (bug #132813) */
+ if (ty == Ity_I16)
+ d32 &= 0xFFFF;
+ storeLE( mkexpr(t1), mkU(ty,d32) );
+ DIP("push%c $0x%x\n", nameISize(sz), d32);
+ break;
+
+ case 0x9C: /* PUSHF */ {
+ vassert(sz == 2 || sz == 4);
+
+ t1 = newTemp(Ity_I32);
+ assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
+ putIReg(4, R_ESP, mkexpr(t1) );
+
+ /* Calculate OSZACP, and patch in fixed fields as per
+ Intel docs.
+ - bit 1 is always 1
+ - bit 9 is Interrupt Enable (should always be 1 in user mode?)
+ */
+ t2 = newTemp(Ity_I32);
+ assign( t2, binop(Iop_Or32,
+ mk_x86g_calculate_eflags_all(),
+ mkU32( (1<<1)|(1<<9) ) ));
+
+ /* Patch in the D flag. This can simply be a copy of bit 10 of
+ baseBlock[OFFB_DFLAG]. */
+ t3 = newTemp(Ity_I32);
+ assign( t3, binop(Iop_Or32,
+ mkexpr(t2),
+ binop(Iop_And32,
+ IRExpr_Get(OFFB_DFLAG,Ity_I32),
+ mkU32(1<<10)))
+ );
+
+ /* And patch in the ID flag. */
+ t4 = newTemp(Ity_I32);
+ assign( t4, binop(Iop_Or32,
+ mkexpr(t3),
+ binop(Iop_And32,
+ binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
+ mkU8(21)),
+ mkU32(1<<21)))
+ );
+
+ /* And patch in the AC flag. */
+ t5 = newTemp(Ity_I32);
+ assign( t5, binop(Iop_Or32,
+ mkexpr(t4),
+ binop(Iop_And32,
+ binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
+ mkU8(18)),
+ mkU32(1<<18)))
+ );
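+         /* (The guest's ID and AC flags are each stored as 0 or 1;
+            shifting left by 21 resp. 18 moves them to their EFLAGS
+            bit positions before masking.) */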
+
+ /* if sz==2, the stored value needs to be narrowed. */
+ if (sz == 2)
+ storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
+ else
+ storeLE( mkexpr(t1), mkexpr(t5) );
+
+ DIP("pushf%c\n", nameISize(sz));
+ break;
+ }
+
+ case 0x60: /* PUSHA */
+ /* This is almost certainly wrong for sz==2. So ... */
+ if (sz != 4) goto decode_failure;
+
+ /* This is the Right Way, in that the value to be pushed is
+ established before %esp is changed, so that pusha
+ correctly pushes the old %esp value. New value of %esp is
+ pushed at start. */
+ /* t0 is the %ESP value we're going to push. */
+ t0 = newTemp(Ity_I32);
+ assign( t0, getIReg(4, R_ESP) );
+
+ /* t5 will be the new %ESP value. */
+ t5 = newTemp(Ity_I32);
+ assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
+
+ /* Update guest state before prodding memory. */
+ putIReg(4, R_ESP, mkexpr(t5));
+
+ /* Dump all the registers. */
+ storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
+ storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
+ storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
+ storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
+ storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
+ storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
+ storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
+ storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
+
+ DIP("pusha%c\n", nameISize(sz));
+ break;
+
+ case 0x0E: /* PUSH %CS */
+ dis_push_segreg( R_CS, sz ); break;
+ case 0x1E: /* PUSH %DS */
+ dis_push_segreg( R_DS, sz ); break;
+ case 0x06: /* PUSH %ES */
+ dis_push_segreg( R_ES, sz ); break;
+ case 0x16: /* PUSH %SS */
+ dis_push_segreg( R_SS, sz ); break;
+
+ /* ------------------------ SCAS et al ----------------- */
+
+ case 0xA4: /* MOVS, no REP prefix */
+ case 0xA5:
+ if (sorb != 0)
+ goto decode_failure; /* else dis_string_op asserts */
+ dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
+ break;
+
+ case 0xA6: /* CMPSb, no REP prefix */
+ case 0xA7:
+ if (sorb != 0)
+ goto decode_failure; /* else dis_string_op asserts */
+ dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
+ break;
+
+ case 0xAA: /* STOS, no REP prefix */
+ case 0xAB:
+ if (sorb != 0)
+ goto decode_failure; /* else dis_string_op asserts */
+ dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
+ break;
+
+ case 0xAC: /* LODS, no REP prefix */
+ case 0xAD:
+ if (sorb != 0)
+ goto decode_failure; /* else dis_string_op asserts */
+ dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
+ break;
+
+ case 0xAE: /* SCAS, no REP prefix */
+ case 0xAF:
+ if (sorb != 0)
+ goto decode_failure; /* else dis_string_op asserts */
+ dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
+ break;
+
+
+ case 0xFC: /* CLD */
+ stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
+ DIP("cld\n");
+ break;
+
+ case 0xFD: /* STD */
+ stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
+ DIP("std\n");
+ break;
+
+ case 0xF8: /* CLC */
+ case 0xF9: /* STC */
+ case 0xF5: /* CMC */
+ t0 = newTemp(Ity_I32);
+ t1 = newTemp(Ity_I32);
+ assign( t0, mk_x86g_calculate_eflags_all() );
+ switch (opc) {
+ case 0xF8:
+ assign( t1, binop(Iop_And32, mkexpr(t0),
+ mkU32(~X86G_CC_MASK_C)));
+ DIP("clc\n");
+ break;
+ case 0xF9:
+ assign( t1, binop(Iop_Or32, mkexpr(t0),
+ mkU32(X86G_CC_MASK_C)));
+ DIP("stc\n");
+ break;
+ case 0xF5:
+ assign( t1, binop(Iop_Xor32, mkexpr(t0),
+ mkU32(X86G_CC_MASK_C)));
+ DIP("cmc\n");
+ break;
+ default:
+ vpanic("disInstr(x86)(clc/stc/cmc)");
+ }
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
+ /* Set NDEP even though it isn't used. This makes redundant-PUT
+ elimination of previous stores to this field work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+ break;
+
+ case 0xD6: /* SALC */
+ t0 = newTemp(Ity_I32);
+ t1 = newTemp(Ity_I32);
+ assign( t0, binop(Iop_And32,
+ mk_x86g_calculate_eflags_c(),
+ mkU32(1)) );
+ assign( t1, binop(Iop_Sar32,
+ binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
+ mkU8(31)) );
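+         /* The Shl/Sar pair replicates bit 0 of t0 across the word,
+            so t1 is 0xFFFFFFFF if CF was set and 0 otherwise, and AL
+            becomes 0xFF or 0x00. */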
+ putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
+ DIP("salc\n");
+ break;
+
+ /* REPNE prefix insn */
+ case 0xF2: {
+ Addr32 eip_orig = guest_EIP_bbstart + delta_start;
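+         /* eip_orig is the address of this insn itself: the REP
+            expansion performs at most one iteration per pass and
+            loops by jumping back here.  (Same scheme for the
+            REP/REPE cases below.) */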
+ if (sorb != 0) goto decode_failure;
+ abyte = getIByte(delta); delta++;
+
+ if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
+ dres.whatNext = Dis_StopHere;
+
+ switch (abyte) {
+ /* According to the Intel manual, "repne movs" should never occur, but
+ * in practice it has happened, so allow for it here... */
+ case 0xA4: sz = 1; /* REPNE MOVS<sz> */
+ case 0xA5:
+ dis_REP_op ( X86CondNZ, dis_MOVS, sz, eip_orig,
+ guest_EIP_bbstart+delta, "repne movs" );
+ break;
+
+ case 0xA6: sz = 1; /* REPNE CMP<sz> */
+ case 0xA7:
+ dis_REP_op ( X86CondNZ, dis_CMPS, sz, eip_orig,
+ guest_EIP_bbstart+delta, "repne cmps" );
+ break;
+
+ case 0xAA: sz = 1; /* REPNE STOS<sz> */
+ case 0xAB:
+ dis_REP_op ( X86CondNZ, dis_STOS, sz, eip_orig,
+ guest_EIP_bbstart+delta, "repne stos" );
+ break;
+
+ case 0xAE: sz = 1; /* REPNE SCAS<sz> */
+ case 0xAF:
+ dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
+ guest_EIP_bbstart+delta, "repne scas" );
+ break;
+
+ default:
+ goto decode_failure;
+ }
+ break;
+ }
+
+ /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
+ for the rest, it means REP) */
+ case 0xF3: {
+ Addr32 eip_orig = guest_EIP_bbstart + delta_start;
+ if (sorb != 0) goto decode_failure;
+ abyte = getIByte(delta); delta++;
+
+ if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
+ dres.whatNext = Dis_StopHere;
+
+ switch (abyte) {
+ case 0xA4: sz = 1; /* REP MOVS<sz> */
+ case 0xA5:
+ dis_REP_op ( X86CondAlways, dis_MOVS, sz, eip_orig,
+ guest_EIP_bbstart+delta, "rep movs" );
+ break;
+
+ case 0xA6: sz = 1; /* REPE CMP<sz> */
+ case 0xA7:
+ dis_REP_op ( X86CondZ, dis_CMPS, sz, eip_orig,
+ guest_EIP_bbstart+delta, "repe cmps" );
+ break;
+
+ case 0xAA: sz = 1; /* REP STOS<sz> */
+ case 0xAB:
+ dis_REP_op ( X86CondAlways, dis_STOS, sz, eip_orig,
+ guest_EIP_bbstart+delta, "rep stos" );
+ break;
+
+ case 0xAC: sz = 1; /* REP LODS<sz> */
+ case 0xAD:
+ dis_REP_op ( X86CondAlways, dis_LODS, sz, eip_orig,
+ guest_EIP_bbstart+delta, "rep lods" );
+ break;
+
+ case 0xAE: sz = 1; /* REPE SCAS<sz> */
+ case 0xAF:
+ dis_REP_op ( X86CondZ, dis_SCAS, sz, eip_orig,
+ guest_EIP_bbstart+delta, "repe scas" );
+ break;
+
+ case 0x90: /* REP NOP (PAUSE) */
+ /* a hint to the P4 re spin-wait loop */
+ DIP("rep nop (P4 pause)\n");
+ /* "observe" the hint. The Vex client needs to be careful not
+ to cause very long delays as a result, though. */
+ jmp_lit(Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
+ dres.whatNext = Dis_StopHere;
+ break;
+
+ case 0xC3: /* REP RET -- same as normal ret? */
+ dis_ret(0);
+ dres.whatNext = Dis_StopHere;
+ DIP("rep ret\n");
+ break;
+
+ default:
+ goto decode_failure;
+ }
+ break;
+ }
+
+ /* ------------------------ XCHG ----------------------- */
+
+ /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
+ prefix; hence it must be translated with an IRCAS (at least, the
+ memory variant). */
+ case 0x86: /* XCHG Gb,Eb */
+ sz = 1;
+ /* Fall through ... */
+ case 0x87: /* XCHG Gv,Ev */
+ modrm = getIByte(delta);
+ ty = szToITy(sz);
+ t1 = newTemp(ty); t2 = newTemp(ty);
+ if (epartIsReg(modrm)) {
+ assign(t1, getIReg(sz, eregOfRM(modrm)));
+ assign(t2, getIReg(sz, gregOfRM(modrm)));
+ putIReg(sz, gregOfRM(modrm), mkexpr(t1));
+ putIReg(sz, eregOfRM(modrm), mkexpr(t2));
+ delta++;
+ DIP("xchg%c %s, %s\n",
+ nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
+ nameIReg(sz,eregOfRM(modrm)));
+ } else {
+ *expect_CAS = True;
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ assign( t1, loadLE(ty,mkexpr(addr)) );
+ assign( t2, getIReg(sz,gregOfRM(modrm)) );
+ casLE( mkexpr(addr),
+ mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
+ putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
+ delta += alen;
+ DIP("xchg%c %s, %s\n", nameISize(sz),
+ nameIReg(sz,gregOfRM(modrm)), dis_buf);
+ }
+ break;
+
+ case 0x90: /* XCHG eAX,eAX */
+ DIP("nop\n");
+ break;
+ case 0x91: /* XCHG eAX,eCX */
+ case 0x92: /* XCHG eAX,eDX */
+ case 0x93: /* XCHG eAX,eBX */
+ case 0x94: /* XCHG eAX,eSP */
+ case 0x95: /* XCHG eAX,eBP */
+ case 0x96: /* XCHG eAX,eSI */
+ case 0x97: /* XCHG eAX,eDI */
+ codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
+ break;
+
+ /* ------------------------ XLAT ----------------------- */
+
+ case 0xD7: /* XLAT */
+         /* sz == 2 (0x66 prefix) is architecturally allowed too, but
+            is not handled here. */
+         if (sz != 4) goto decode_failure;
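+         /* XLAT: AL := the byte at seg:[EBX + zero-extended AL]. */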
+ putIReg(
+ 1,
+ R_EAX/*AL*/,
+ loadLE(Ity_I8,
+ handleSegOverride(
+ sorb,
+ binop(Iop_Add32,
+ getIReg(4, R_EBX),
+ unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
+
+ DIP("xlat%c [ebx]\n", nameISize(sz));
+ break;
+
+ /* ------------------------ IN / OUT ----------------------- */
+
+ case 0xE4: /* IN imm8, AL */
+ sz = 1;
+ t1 = newTemp(Ity_I32);
+ abyte = getIByte(delta); delta++;
+ assign(t1, mkU32( abyte & 0xFF ));
+ DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
+ goto do_IN;
+ case 0xE5: /* IN imm8, eAX */
+ vassert(sz == 2 || sz == 4);
+ t1 = newTemp(Ity_I32);
+ abyte = getIByte(delta); delta++;
+ assign(t1, mkU32( abyte & 0xFF ));
+ DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
+ goto do_IN;
+ case 0xEC: /* IN %DX, AL */
+ sz = 1;
+ t1 = newTemp(Ity_I32);
+ assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
+ DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
+ nameIReg(sz,R_EAX));
+ goto do_IN;
+ case 0xED: /* IN %DX, eAX */
+ vassert(sz == 2 || sz == 4);
+ t1 = newTemp(Ity_I32);
+ assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
+ DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
+ nameIReg(sz,R_EAX));
+ goto do_IN;
+ do_IN: {
+ /* At this point, sz indicates the width, and t1 is a 32-bit
+ value giving port number. */
+ IRDirty* d;
+ vassert(sz == 1 || sz == 2 || sz == 4);
+ ty = szToITy(sz);
+ t2 = newTemp(Ity_I32);
+ d = unsafeIRDirty_1_N(
+ t2,
+ 0/*regparms*/,
+ "x86g_dirtyhelper_IN",
+ &x86g_dirtyhelper_IN,
+ mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
+ );
+ /* do the call, dumping the result in t2. */
+ stmt( IRStmt_Dirty(d) );
+ putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
+ break;
+ }
+
+ case 0xE6: /* OUT AL, imm8 */
+ sz = 1;
+ t1 = newTemp(Ity_I32);
+ abyte = getIByte(delta); delta++;
+ assign( t1, mkU32( abyte & 0xFF ) );
+ DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
+ goto do_OUT;
+ case 0xE7: /* OUT eAX, imm8 */
+ vassert(sz == 2 || sz == 4);
+ t1 = newTemp(Ity_I32);
+ abyte = getIByte(delta); delta++;
+ assign( t1, mkU32( abyte & 0xFF ) );
+ DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
+ goto do_OUT;
+ case 0xEE: /* OUT AL, %DX */
+ sz = 1;
+ t1 = newTemp(Ity_I32);
+ assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
+ DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
+ nameIReg(2,R_EDX));
+ goto do_OUT;
+ case 0xEF: /* OUT eAX, %DX */
+ vassert(sz == 2 || sz == 4);
+ t1 = newTemp(Ity_I32);
+ assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
+ DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
+ nameIReg(2,R_EDX));
+ goto do_OUT;
+ do_OUT: {
+ /* At this point, sz indicates the width, and t1 is a 32-bit
+ value giving port number. */
+ IRDirty* d;
+ vassert(sz == 1 || sz == 2 || sz == 4);
+ ty = szToITy(sz);
+ d = unsafeIRDirty_0_N(
+ 0/*regparms*/,
+ "x86g_dirtyhelper_OUT",
+ &x86g_dirtyhelper_OUT,
+ mkIRExprVec_3( mkexpr(t1),
+ widenUto32( getIReg(sz, R_EAX) ),
+ mkU32(sz) )
+ );
+ stmt( IRStmt_Dirty(d) );
+ break;
+ }
+
+ /* ------------------------ (Grp1 extensions) ---------- */
+
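+      /* In the Grp1 (and Grp2) cases, am_sz is the length in bytes of
+         the mod/rm and amode, and d_sz the size of the immediate (zero
+         if there is none), so the immediate is fetched from code
+         offset delta + am_sz. */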
+ case 0x82: /* Grp1 Ib,Eb too. Apparently this is the same as
+ case 0x80, but only in 32-bit mode. */
+ /* fallthru */
+ case 0x80: /* Grp1 Ib,Eb */
+ modrm = getIByte(delta);
+ am_sz = lengthAMode(delta);
+ sz = 1;
+ d_sz = 1;
+ d32 = getUChar(delta + am_sz);
+ delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
+ break;
+
+ case 0x81: /* Grp1 Iv,Ev */
+ modrm = getIByte(delta);
+ am_sz = lengthAMode(delta);
+ d_sz = sz;
+ d32 = getUDisp(d_sz, delta + am_sz);
+ delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
+ break;
+
+ case 0x83: /* Grp1 Ib,Ev */
+ modrm = getIByte(delta);
+ am_sz = lengthAMode(delta);
+ d_sz = 1;
+ d32 = getSDisp8(delta + am_sz);
+ delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
+ break;
+
+ /* ------------------------ (Grp2 extensions) ---------- */
+
+ case 0xC0: { /* Grp2 Ib,Eb */
+ Bool decode_OK = True;
+ modrm = getIByte(delta);
+ am_sz = lengthAMode(delta);
+ d_sz = 1;
+ d32 = getUChar(delta + am_sz);
+ sz = 1;
+ delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
+ mkU8(d32 & 0xFF), NULL, &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+ case 0xC1: { /* Grp2 Ib,Ev */
+ Bool decode_OK = True;
+ modrm = getIByte(delta);
+ am_sz = lengthAMode(delta);
+ d_sz = 1;
+ d32 = getUChar(delta + am_sz);
+ delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
+ mkU8(d32 & 0xFF), NULL, &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+ case 0xD0: { /* Grp2 1,Eb */
+ Bool decode_OK = True;
+ modrm = getIByte(delta);
+ am_sz = lengthAMode(delta);
+ d_sz = 0;
+ d32 = 1;
+ sz = 1;
+ delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
+ mkU8(d32), NULL, &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+ case 0xD1: { /* Grp2 1,Ev */
+ Bool decode_OK = True;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(delta);
+ d_sz = 0;
+ d32 = 1;
+ delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
+ mkU8(d32), NULL, &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+ case 0xD2: { /* Grp2 CL,Eb */
+ Bool decode_OK = True;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(delta);
+ d_sz = 0;
+ sz = 1;
+ delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
+ getIReg(1,R_ECX), "%cl", &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+ case 0xD3: { /* Grp2 CL,Ev */
+ Bool decode_OK = True;
+ modrm = getIByte(delta);
+ am_sz = lengthAMode(delta);
+ d_sz = 0;
+ delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
+ getIReg(1,R_ECX), "%cl", &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+
+ /* ------------------------ (Grp3 extensions) ---------- */
+
+ case 0xF6: { /* Grp3 Eb */
+ Bool decode_OK = True;
+ delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+ case 0xF7: { /* Grp3 Ev */
+ Bool decode_OK = True;
+ delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+
+ /* ------------------------ (Grp4 extensions) ---------- */
+
+ case 0xFE: { /* Grp4 Eb */
+ Bool decode_OK = True;
+ delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+
+ /* ------------------------ (Grp5 extensions) ---------- */
+
+ case 0xFF: { /* Grp5 Ev */
+ Bool decode_OK = True;
+ delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+
+ /* ------------------------ Escapes to 2-byte opcodes -- */
+
+ case 0x0F: {
+ opc = getIByte(delta); delta++;
+ switch (opc) {
+
+ /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
+
+ case 0xBA: { /* Grp8 Ib,Ev */
+ Bool decode_OK = False;
+ modrm = getUChar(delta);
+ am_sz = lengthAMode(delta);
+ d32 = getSDisp8(delta + am_sz);
+ delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
+ am_sz, sz, d32, &decode_OK );
+ if (!decode_OK)
+ goto decode_failure;
+ break;
+ }
+
+ /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
+
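+         /* (the trailing Bool selects BSF when True, BSR when False) */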
+ case 0xBC: /* BSF Gv,Ev */
+ delta = dis_bs_E_G ( sorb, sz, delta, True );
+ break;
+ case 0xBD: /* BSR Gv,Ev */
+ delta = dis_bs_E_G ( sorb, sz, delta, False );
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
+
+ case 0xC8: /* BSWAP %eax */
+ case 0xC9:
+ case 0xCA:
+ case 0xCB:
+ case 0xCC:
+ case 0xCD:
+ case 0xCE:
+ case 0xCF: /* BSWAP %edi */
+ /* AFAICS from the Intel docs, this only exists at size 4. */
+ vassert(sz == 4);
+ t1 = newTemp(Ity_I32);
+ t2 = newTemp(Ity_I32);
+ assign( t1, getIReg(4, opc-0xC8) );
+
+ assign( t2,
+ binop(Iop_Or32,
+ binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
+ binop(Iop_Or32,
+ binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
+ mkU32(0x00FF0000)),
+ binop(Iop_Or32,
+ binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
+ mkU32(0x0000FF00)),
+ binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
+ mkU32(0x000000FF) )
+ )))
+ );
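+            /* e.g. t1 = 0x11223344 gives t2 = 0x44332211. */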
+
+ putIReg(4, opc-0xC8, mkexpr(t2));
+ DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
+
+ case 0xA3: /* BT Gv,Ev */
+ delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
+ break;
+ case 0xB3: /* BTR Gv,Ev */
+ delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
+ break;
+ case 0xAB: /* BTS Gv,Ev */
+ delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
+ break;
+ case 0xBB: /* BTC Gv,Ev */
+ delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
+
+         case 0x40: /* CMOVOb (cmov overflow) */
+         case 0x41: /* CMOVNOb (cmov not overflow) */
+         case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
+         case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
+         case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
+         case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
+         case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
+         case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
+         case 0x48: /* CMOVSb (cmov negative) */
+         case 0x49: /* CMOVNSb (cmov not negative) */
+ case 0x4A: /* CMOVP (cmov parity even) */
+ case 0x4B: /* CMOVNP (cmov parity odd) */
+ case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
+ case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
+ case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
+ case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
+ delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
+
+ case 0xB0: /* CMPXCHG Gb,Eb */
+ delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
+ break;
+ case 0xB1: /* CMPXCHG Gv,Ev */
+ delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
+ break;
+
+ case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
+ IRTemp expdHi = newTemp(Ity_I32);
+ IRTemp expdLo = newTemp(Ity_I32);
+ IRTemp dataHi = newTemp(Ity_I32);
+ IRTemp dataLo = newTemp(Ity_I32);
+ IRTemp oldHi = newTemp(Ity_I32);
+ IRTemp oldLo = newTemp(Ity_I32);
+ IRTemp flags_old = newTemp(Ity_I32);
+ IRTemp flags_new = newTemp(Ity_I32);
+ IRTemp success = newTemp(Ity_I1);
+
+ /* Translate this using a DCAS, even if there is no LOCK
+ prefix. Life is too short to bother with generating two
+ different translations for the with/without-LOCK-prefix
+ cases. */
+ *expect_CAS = True;
+
+ /* Decode, and generate address. */
+ if (sz != 4) goto decode_failure;
+ modrm = getIByte(delta);
+ if (epartIsReg(modrm)) goto decode_failure;
+ if (gregOfRM(modrm) != 1) goto decode_failure;
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ delta += alen;
+
+ /* Get the expected and new values. */
+ assign( expdHi, getIReg(4,R_EDX) );
+ assign( expdLo, getIReg(4,R_EAX) );
+ assign( dataHi, getIReg(4,R_ECX) );
+ assign( dataLo, getIReg(4,R_EBX) );
+
+ /* Do the DCAS */
+ stmt( IRStmt_CAS(
+ mkIRCAS( oldHi, oldLo,
+ Iend_LE, mkexpr(addr),
+ mkexpr(expdHi), mkexpr(expdLo),
+ mkexpr(dataHi), mkexpr(dataLo)
+ )));
+
+ /* success when oldHi:oldLo == expdHi:expdLo */
+ assign( success,
+ binop(Iop_CasCmpEQ32,
+ binop(Iop_Or32,
+ binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
+ binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
+ ),
+ mkU32(0)
+ ));
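+            /* (oldHi ^ expdHi) | (oldLo ^ expdLo) is zero exactly
+               when both halves match. */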
+
+ /* If the DCAS is successful, that is to say oldHi:oldLo ==
+ expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
+ which is where they came from originally. Both the actual
+ contents of these two regs, and any shadow values, are
+ unchanged. If the DCAS fails then we're putting into
+ EDX:EAX the value seen in memory. */
+ putIReg(4, R_EDX,
+ IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
+ mkexpr(oldHi),
+ mkexpr(expdHi)
+ ));
+ putIReg(4, R_EAX,
+ IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
+ mkexpr(oldLo),
+ mkexpr(expdLo)
+ ));
+
+ /* Copy the success bit into the Z flag and leave the others
+ unchanged */
+ assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
+ assign(
+ flags_new,
+ binop(Iop_Or32,
+ binop(Iop_And32, mkexpr(flags_old),
+ mkU32(~X86G_CC_MASK_Z)),
+ binop(Iop_Shl32,
+ binop(Iop_And32,
+ unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
+ mkU8(X86G_CC_SHIFT_Z)) ));
+
+ stmt( IRStmt_Put( OFFB_CC_OP, mkU32(X86G_CC_OP_COPY) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
+ stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
+ /* Set NDEP even though it isn't used. This makes
+ redundant-PUT elimination of previous stores to this field
+ work better. */
+ stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
+
+ /* Sheesh. Aren't you glad it was me and not you that had to
+ write and validate all this grunge? */
+
+ DIP("cmpxchg8b %s\n", dis_buf);
+ break;
+ }
+
+ /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
+
+ case 0xA2: { /* CPUID */
+ /* Uses dirty helper:
+ void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
+ declared to mod eax, wr ebx, ecx, edx
+ */
+ IRDirty* d = NULL;
+ HChar* fName = NULL;
+ void* fAddr = NULL;
+ if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
+ fName = "x86g_dirtyhelper_CPUID_sse2";
+ fAddr = &x86g_dirtyhelper_CPUID_sse2;
+ }
+ else
+ if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
+ fName = "x86g_dirtyhelper_CPUID_sse1";
+ fAddr = &x86g_dirtyhelper_CPUID_sse1;
+ }
+ else
+ if (archinfo->hwcaps == 0/*no SSE*/) {
+ fName = "x86g_dirtyhelper_CPUID_sse0";
+ fAddr = &x86g_dirtyhelper_CPUID_sse0;
+ } else
+ vpanic("disInstr(x86)(cpuid)");
+
+ vassert(fName); vassert(fAddr);
+ d = unsafeIRDirty_0_N ( 0/*regparms*/,
+ fName, fAddr, mkIRExprVec_0() );
+ /* declare guest state effects */
+ d->needsBBP = True;
+ d->nFxState = 4;
+ d->fxState[0].fx = Ifx_Modify;
+ d->fxState[0].offset = OFFB_EAX;
+ d->fxState[0].size = 4;
+ d->fxState[1].fx = Ifx_Write;
+ d->fxState[1].offset = OFFB_EBX;
+ d->fxState[1].size = 4;
+ d->fxState[2].fx = Ifx_Modify;
+ d->fxState[2].offset = OFFB_ECX;
+ d->fxState[2].size = 4;
+ d->fxState[3].fx = Ifx_Write;
+ d->fxState[3].offset = OFFB_EDX;
+ d->fxState[3].size = 4;
+ /* execute the dirty call, side-effecting guest state */
+ stmt( IRStmt_Dirty(d) );
+ /* CPUID is a serialising insn. So, just in case someone is
+ using it as a memory fence ... */
+ stmt( IRStmt_MBE(Imbe_Fence) );
+ DIP("cpuid\n");
+ break;
+ }
+
+//-- if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
+//-- goto decode_failure;
+//--
+//-- t1 = newTemp(cb);
+//-- t2 = newTemp(cb);
+//-- t3 = newTemp(cb);
+//-- t4 = newTemp(cb);
+//-- uInstr0(cb, CALLM_S, 0);
+//--
+//-- uInstr2(cb, GET, 4, ArchReg, R_EAX, TempReg, t1);
+//-- uInstr1(cb, PUSH, 4, TempReg, t1);
+//--
+//-- uInstr2(cb, MOV, 4, Literal, 0, TempReg, t2);
+//-- uLiteral(cb, 0);
+//-- uInstr1(cb, PUSH, 4, TempReg, t2);
+//--
+//-- uInstr2(cb, MOV, 4, Literal, 0, TempReg, t3);
+//-- uLiteral(cb, 0);
+//-- uInstr1(cb, PUSH, 4, TempReg, t3);
+//--
+//-- uInstr2(cb, MOV, 4, Literal, 0, TempReg, t4);
+//-- uLiteral(cb, 0);
+//-- uInstr1(cb, PUSH, 4, TempReg, t4);
+//--
+//-- uInstr1(cb, CALLM, 0, Lit16, VGOFF_(helper_CPUID));
+//-- uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
+//--
+//-- uInstr1(cb, POP, 4, TempReg, t4);
+//-- uInstr2(cb, PUT, 4, TempReg, t4, ArchReg, R_EDX);
+//--
+//-- uInstr1(cb, POP, 4, TempReg, t3);
+//-- uInstr2(cb, PUT, 4, TempReg, t3, ArchReg, R_ECX);
+//--
+//-- uInstr1(cb, POP, 4, TempReg, t2);
+//-- uInstr2(cb, PUT, 4, TempReg, t2, ArchReg, R_EBX);
+//--
+//-- uInstr1(cb, POP, 4, TempReg, t1);
+//-- uInstr2(cb, PUT, 4, TempReg, t1, ArchReg, R_EAX);
+//--
+//-- uInstr0(cb, CALLM_E, 0);
+//-- DIP("cpuid\n");
+//-- break;
+//--
+ /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
+
+ case 0xB6: /* MOVZXb Eb,Gv */
+ if (sz != 2 && sz != 4)
+ goto decode_failure;
+ delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
+ break;
+
+ case 0xB7: /* MOVZXw Ew,Gv */
+ if (sz != 4)
+ goto decode_failure;
+ delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
+ break;
+
+ case 0xBE: /* MOVSXb Eb,Gv */
+ if (sz != 2 && sz != 4)
+ goto decode_failure;
+ delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
+ break;
+
+ case 0xBF: /* MOVSXw Ew,Gv */
+            if (sz != 4 && /* accept movsww, sigh, see #250799 */ sz != 2)
+ goto decode_failure;
+ delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
+ break;
+
+//-- /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
+//--
+//-- case 0xC3: /* MOVNTI Gv,Ev */
+//-- vg_assert(sz == 4);
+//-- modrm = getUChar(eip);
+//-- vg_assert(!epartIsReg(modrm));
+//-- t1 = newTemp(cb);
+//-- uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
+//-- pair = disAMode ( cb, sorb, eip, dis_buf );
+//-- t2 = LOW24(pair);
+//-- eip += HI8(pair);
+//-- uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
+//-- DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
+//-- break;
+
+ /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
+
+ case 0xAF: /* IMUL Ev, Gv */
+ delta = dis_mul_E_G ( sorb, sz, delta );
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
+
+         case 0x1F: /* NOP Ev (multi-byte NOP, 0F 1F) */
+ modrm = getUChar(delta);
+ if (epartIsReg(modrm)) goto decode_failure;
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ delta += alen;
+ DIP("nop%c %s\n", nameISize(sz), dis_buf);
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
+         case 0x80: /* JOb (jump overflow) */
+         case 0x81: /* JNOb (jump not overflow) */
+         case 0x82: /* JBb/JNAEb (jump below) */
+         case 0x83: /* JNBb/JAEb (jump not below) */
+         case 0x84: /* JZb/JEb (jump zero) */
+         case 0x85: /* JNZb/JNEb (jump not zero) */
+         case 0x86: /* JBEb/JNAb (jump below or equal) */
+         case 0x87: /* JNBEb/JAb (jump not below or equal) */
+         case 0x88: /* JSb (jump negative) */
+         case 0x89: /* JNSb (jump not negative) */
+ case 0x8A: /* JP (jump parity even) */
+ case 0x8B: /* JNP/JPO (jump parity odd) */
+ case 0x8C: /* JLb/JNGEb (jump less) */
+ case 0x8D: /* JGEb/JNLb (jump greater or equal) */
+ case 0x8E: /* JLEb/JNGb (jump less or equal) */
+ case 0x8F: /* JGb/JNLEb (jump greater) */
+ { Int jmpDelta;
+ HChar* comment = "";
+ jmpDelta = (Int)getUDisp32(delta);
+ d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
+ delta += 4;
+ if (resteerCisOk
+ && vex_control.guest_chase_cond
+ && (Addr32)d32 != (Addr32)guest_EIP_bbstart
+ && jmpDelta < 0
+ && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
+ /* Speculation: assume this backward branch is taken. So
+ we need to emit a side-exit to the insn following this
+ one, on the negation of the condition, and continue at
+ the branch target address (d32). If we wind up back at
+ the first instruction of the trace, just stop; it's
+ better to let the IR loop unroller handle that case.*/
+ stmt( IRStmt_Exit(
+ mk_x86g_calculate_condition((X86Condcode)
+ (1 ^ (opc - 0x80))),
+ Ijk_Boring,
+ IRConst_U32(guest_EIP_bbstart+delta) ) );
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = (Addr64)(Addr32)d32;
+ comment = "(assumed taken)";
+ }
+ else
+ if (resteerCisOk
+ && vex_control.guest_chase_cond
+ && (Addr32)d32 != (Addr32)guest_EIP_bbstart
+ && jmpDelta >= 0
+ && resteerOkFn( callback_opaque,
+ (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
+ /* Speculation: assume this forward branch is not taken.
+ So we need to emit a side-exit to d32 (the dest) and
+ continue disassembling at the insn immediately
+ following this one. */
+ stmt( IRStmt_Exit(
+ mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
+ Ijk_Boring,
+ IRConst_U32(d32) ) );
+ dres.whatNext = Dis_ResteerC;
+ dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
+ comment = "(assumed not taken)";
+ }
+ else {
+ /* Conservative default translation - end the block at
+ this point. */
+ jcc_01( (X86Condcode)(opc - 0x80),
+ (Addr32)(guest_EIP_bbstart+delta), d32);
+ dres.whatNext = Dis_StopHere;
+ }
+ DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
+ break;
+ }
+
+ /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
+ case 0x31: { /* RDTSC */
+ IRTemp val = newTemp(Ity_I64);
+ IRExpr** args = mkIRExprVec_0();
+ IRDirty* d = unsafeIRDirty_1_N (
+ val,
+ 0/*regparms*/,
+ "x86g_dirtyhelper_RDTSC",
+ &x86g_dirtyhelper_RDTSC,
+ args
+ );
+ /* execute the dirty call, dumping the result in val. */
+ stmt( IRStmt_Dirty(d) );
+ putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
+ putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
+ DIP("rdtsc\n");
+ break;
+ }
+
+ /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
+
+ case 0xA1: /* POP %FS */
+ dis_pop_segreg( R_FS, sz ); break;
+ case 0xA9: /* POP %GS */
+ dis_pop_segreg( R_GS, sz ); break;
+
+ case 0xA0: /* PUSH %FS */
+ dis_push_segreg( R_FS, sz ); break;
+ case 0xA8: /* PUSH %GS */
+ dis_push_segreg( R_GS, sz ); break;
+
+ /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
+         case 0x90: /* set-Ob (set overflow) */
+         case 0x91: /* set-NOb (set not overflow) */
+         case 0x92: /* set-Bb/set-NAEb (set below) */
+         case 0x93: /* set-NBb/set-AEb (set not below) */
+         case 0x94: /* set-Zb/set-Eb (set zero) */
+         case 0x95: /* set-NZb/set-NEb (set not zero) */
+         case 0x96: /* set-BEb/set-NAb (set below or equal) */
+         case 0x97: /* set-NBEb/set-Ab (set not below or equal) */
+         case 0x98: /* set-Sb (set negative) */
+         case 0x99: /* set-NSb (set not negative) */
+         case 0x9A: /* set-Pb (set parity even) */
+         case 0x9B: /* set-NPb/set-POb (set parity odd) */
+         case 0x9C: /* set-Lb/set-NGEb (set less) */
+         case 0x9D: /* set-GEb/set-NLb (set greater or equal) */
+         case 0x9E: /* set-LEb/set-NGb (set less or equal) */
+         case 0x9F: /* set-Gb/set-NLEb (set greater) */
+ t1 = newTemp(Ity_I8);
+ assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
+ modrm = getIByte(delta);
+ if (epartIsReg(modrm)) {
+ delta++;
+ putIReg(1, eregOfRM(modrm), mkexpr(t1));
+ DIP("set%s %s\n", name_X86Condcode(opc-0x90),
+ nameIReg(1,eregOfRM(modrm)));
+ } else {
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ delta += alen;
+ storeLE( mkexpr(addr), mkexpr(t1) );
+ DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
+ }
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
+
+ case 0xA4: /* SHLDv imm8,Gv,Ev */
+ modrm = getIByte(delta);
+ d32 = delta + lengthAMode(delta);
+ vex_sprintf(dis_buf, "$%d", getIByte(d32));
+ delta = dis_SHLRD_Gv_Ev (
+ sorb, delta, modrm, sz,
+ mkU8(getIByte(d32)), True, /* literal */
+ dis_buf, True );
+ break;
+ case 0xA5: /* SHLDv %cl,Gv,Ev */
+ modrm = getIByte(delta);
+ delta = dis_SHLRD_Gv_Ev (
+ sorb, delta, modrm, sz,
+ getIReg(1,R_ECX), False, /* not literal */
+ "%cl", True );
+ break;
+
+ case 0xAC: /* SHRDv imm8,Gv,Ev */
+ modrm = getIByte(delta);
+ d32 = delta + lengthAMode(delta);
+ vex_sprintf(dis_buf, "$%d", getIByte(d32));
+ delta = dis_SHLRD_Gv_Ev (
+ sorb, delta, modrm, sz,
+ mkU8(getIByte(d32)), True, /* literal */
+ dis_buf, False );
+ break;
+ case 0xAD: /* SHRDv %cl,Gv,Ev */
+ modrm = getIByte(delta);
+ delta = dis_SHLRD_Gv_Ev (
+ sorb, delta, modrm, sz,
+ getIReg(1,R_ECX), False, /* not literal */
+ "%cl", False );
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
+
+ case 0x34:
+         /* Simple implementation needing a long explanation.
+
+ sysenter is a kind of syscall entry. The key thing here
+ is that the return address is not known -- that is
+ something that is beyond Vex's knowledge. So this IR
+ forces a return to the scheduler, which can do what it
+            likes to simulate the sysenter, but it MUST set this
+ thread's guest_EIP field with the continuation address
+ before resuming execution. If that doesn't happen, the
+ thread will jump to address zero, which is probably
+ fatal.
+ */
+
+ /* Note where we are, so we can back up the guest to this
+ point if the syscall needs to be restarted. */
+ stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
+ mkU32(guest_EIP_curr_instr) ) );
+ jmp_lit(Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
+ dres.whatNext = Dis_StopHere;
+         DIP("sysenter\n");
+ break;
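+
+      /* A hypothetical sketch of the scheduler-side handling implied
+         by the comment above (the names are illustrative only):
+
+            case sysenter_trc:
+               gst->guest_EIP = continuation_addr;   <-- MUST happen
+               do_syscall(gst);
+               break;
+
+         the essential point being that guest_EIP is written before
+         the thread resumes. */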
+
+ /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
+
+ case 0xC0: { /* XADD Gb,Eb */
+ Bool decodeOK;
+ delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
+ if (!decodeOK) goto decode_failure;
+ break;
+ }
+ case 0xC1: { /* XADD Gv,Ev */
+ Bool decodeOK;
+ delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
+ if (!decodeOK) goto decode_failure;
+ break;
+ }
+
+ /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
+
+ case 0x71:
+ case 0x72:
+ case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
+
+ case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
+ case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
+ case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
+ case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xFC:
+ case 0xFD:
+ case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xEC:
+ case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xDC:
+ case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xF8:
+ case 0xF9:
+ case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xE8:
+ case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xD8:
+ case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x74:
+ case 0x75:
+ case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x64:
+ case 0x65:
+ case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x68:
+ case 0x69:
+ case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0x60:
+ case 0x61:
+ case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
+
+ case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xF2:
+ case 0xF3:
+
+ case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xD2:
+ case 0xD3:
+
+ case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
+ case 0xE2:
+ {
+ Int delta0 = delta-1;
+ Bool decode_OK = False;
+
+         /* If sz==2 this is SSE, and we assume the SSE decoder has
+            already spotted those cases by now. */
+ if (sz != 4)
+ goto decode_failure;
+
+ delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
+ if (!decode_OK) {
+ delta = delta0;
+ goto decode_failure;
+ }
+ break;
+ }
+
+ case 0x77: /* EMMS */
+ if (sz != 4)
+ goto decode_failure;
+ do_EMMS_preamble();
+ DIP("emms\n");
+ break;
+
+ /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
+ case 0x01: /* 0F 01 /0 -- SGDT */
+ /* 0F 01 /1 -- SIDT */
+ {
+ /* This is really revolting, but ... since each processor
+ (core) only has one IDT and one GDT, just let the guest
+ see it (pass-through semantics). I can't see any way to
+ construct a faked-up value, so don't bother to try. */
+ modrm = getUChar(delta);
+ addr = disAMode ( &alen, sorb, delta, dis_buf );
+ delta += alen;
+ if (epartIsReg(modrm)) goto decode_failure;
+ if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
+ goto decode_failure;
+ switch (gregOfRM(modrm)) {
+ case 0: DIP("sgdt %s\n", dis_buf); break;
+ case 1: DIP("sidt %s\n", dis_buf); break;
+ default: vassert(0); /*NOTREACHED*/
+ }
+
+ IRDirty* d = unsafeIRDirty_0_N (
+ 0/*regparms*/,
+ "x86g_dirtyhelper_SxDT",
+ &x86g_dirtyhelper_SxDT,
+ mkIRExprVec_2( mkexpr(addr),
+ mkU32(gregOfRM(modrm)) )
+ );
+ /* declare we're writing memory */
+ d->mFx = Ifx_Write;
+ d->mAddr = mkexpr(addr);
+ d->mSize = 6;
+ stmt( IRStmt_Dirty(d) );
+ break;
+ }
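+
+      /* For reference: the 6 bytes written here form the usual x86
+         pseudo-descriptor,
+
+            bytes 0..1 : table limit (16 bits)
+            bytes 2..5 : table base  (32 bits)
+
+         which is why d->mSize is set to 6 above. */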
+
+ /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
+
+ default:
+ goto decode_failure;
+ } /* switch (opc) for the 2-byte opcodes */
+ goto decode_success;
+ } /* case 0x0F: of primary opcode */
+
+ /* ------------------------ ??? ------------------------ */
+
+ default:
+ decode_failure:
+ /* All decode failures end up here. */
+ vex_printf("vex x86->IR: unhandled instruction bytes: "
+ "0x%x 0x%x 0x%x 0x%x\n",
+ (Int)getIByte(delta_start+0),
+ (Int)getIByte(delta_start+1),
+ (Int)getIByte(delta_start+2),
+ (Int)getIByte(delta_start+3) );
+
+ /* Tell the dispatcher that this insn cannot be decoded, and so has
+ not been executed, and (is currently) the next to be executed.
+      EIP should be up-to-date since it was made so at the start of each
+ insn, but nevertheless be paranoid and update it again right
+ now. */
+ stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
+ jmp_lit(Ijk_NoDecode, guest_EIP_curr_instr);
+ dres.whatNext = Dis_StopHere;
+ dres.len = 0;
+ /* We also need to say that a CAS is not expected now, regardless
+ of what it might have been set to at the start of the function,
+      since the IR that we've emitted just above (to synthesise a
+ SIGILL) does not involve any CAS, and presumably no other IR has
+ been emitted for this (non-decoded) insn. */
+ *expect_CAS = False;
+ return dres;
+
+ } /* switch (opc) for the main (primary) opcode switch. */
+
+ decode_success:
+ /* All decode successes end up here. */
+ DIP("\n");
+ dres.len = delta - delta_start;
+ return dres;
+}
+
+#undef DIP
+#undef DIS
+
+
+/*------------------------------------------------------------*/
+/*--- Top-level fn ---*/
+/*------------------------------------------------------------*/
+
+/* Disassemble a single instruction into IR. The instruction
+ is located in host memory at &guest_code[delta]. */
+
+DisResult disInstr_X86 ( IRSB* irsb_IN,
+ Bool put_IP,
+ Bool (*resteerOkFn) ( void*, Addr64 ),
+ Bool resteerCisOk,
+ void* callback_opaque,
+ UChar* guest_code_IN,
+ Long delta,
+ Addr64 guest_IP,
+ VexArch guest_arch,
+ VexArchInfo* archinfo,
+ VexAbiInfo* abiinfo,
+ Bool host_bigendian_IN )
+{
+ Int i, x1, x2;
+ Bool expect_CAS, has_CAS;
+ DisResult dres;
+
+ /* Set globals (see top of this file) */
+ vassert(guest_arch == VexArchX86);
+ guest_code = guest_code_IN;
+ irsb = irsb_IN;
+ host_is_bigendian = host_bigendian_IN;
+ guest_EIP_curr_instr = (Addr32)guest_IP;
+ guest_EIP_bbstart = (Addr32)toUInt(guest_IP - delta);
+
+ x1 = irsb_IN->stmts_used;
+ expect_CAS = False;
+ dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
+ resteerCisOk,
+ callback_opaque,
+ delta, archinfo, abiinfo );
+ x2 = irsb_IN->stmts_used;
+ vassert(x2 >= x1);
+
+ /* See comment at the top of disInstr_X86_WRK for meaning of
+ expect_CAS. Here, we (sanity-)check for the presence/absence of
+ IRCAS as directed by the returned expect_CAS value. */
+ has_CAS = False;
+ for (i = x1; i < x2; i++) {
+ if (irsb_IN->stmts[i]->tag == Ist_CAS)
+ has_CAS = True;
+ }
+
+ if (expect_CAS != has_CAS) {
+      /* Inconsistency detected. Re-disassemble the instruction so as
+ to generate a useful error message; then assert. */
+ vex_traceflags |= VEX_TRACE_FE;
+ dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
+ resteerCisOk,
+ callback_opaque,
+ delta, archinfo, abiinfo );
+ for (i = x1; i < x2; i++) {
+ vex_printf("\t\t");
+ ppIRStmt(irsb_IN->stmts[i]);
+ vex_printf("\n");
+ }
+ /* Failure of this assertion is serious and denotes a bug in
+ disInstr. */
+ vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
+ }
+
+ return dres;
+}
+
+
+/*--------------------------------------------------------------------*/
+/*--- end guest_x86_toIR.c ---*/
+/*--------------------------------------------------------------------*/
diff --git a/VEX/priv/host_amd64_defs.c b/VEX/priv/host_amd64_defs.c
new file mode 100644
index 0000000..80bb6d8
--- /dev/null
+++ b/VEX/priv/host_amd64_defs.c
@@ -0,0 +1,3576 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_amd64_defs.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex.h"
+#include "libvex_trc_values.h"
+
+#include "main_util.h"
+#include "host_generic_regs.h"
+#include "host_amd64_defs.h"
+
+
+/* --------- Registers. --------- */
+
+void ppHRegAMD64 ( HReg reg )
+{
+ Int r;
+ static HChar* ireg64_names[16]
+ = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
+ /* Be generic for all virtual regs. */
+ if (hregIsVirtual(reg)) {
+ ppHReg(reg);
+ return;
+ }
+ /* But specific for real regs. */
+ switch (hregClass(reg)) {
+ case HRcInt64:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 16);
+ vex_printf("%s", ireg64_names[r]);
+ return;
+ case HRcFlt64:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 6);
+ vex_printf("%%fake%d", r);
+ return;
+ case HRcVec128:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 16);
+ vex_printf("%%xmm%d", r);
+ return;
+ default:
+ vpanic("ppHRegAMD64");
+ }
+}
+
+static void ppHRegAMD64_lo32 ( HReg reg )
+{
+ Int r;
+ static HChar* ireg32_names[16]
+ = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
+ "%r8d", "%r9d", "%r10d", "%r11d", "%r12d", "%r13d", "%r14d", "%r15d" };
+ /* Be generic for all virtual regs. */
+ if (hregIsVirtual(reg)) {
+ ppHReg(reg);
+ vex_printf("d");
+ return;
+ }
+ /* But specific for real regs. */
+ switch (hregClass(reg)) {
+ case HRcInt64:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 16);
+ vex_printf("%s", ireg32_names[r]);
+ return;
+ default:
+ vpanic("ppHRegAMD64_lo32: invalid regclass");
+ }
+}
+
+HReg hregAMD64_RAX ( void ) { return mkHReg( 0, HRcInt64, False); }
+HReg hregAMD64_RCX ( void ) { return mkHReg( 1, HRcInt64, False); }
+HReg hregAMD64_RDX ( void ) { return mkHReg( 2, HRcInt64, False); }
+HReg hregAMD64_RBX ( void ) { return mkHReg( 3, HRcInt64, False); }
+HReg hregAMD64_RSP ( void ) { return mkHReg( 4, HRcInt64, False); }
+HReg hregAMD64_RBP ( void ) { return mkHReg( 5, HRcInt64, False); }
+HReg hregAMD64_RSI ( void ) { return mkHReg( 6, HRcInt64, False); }
+HReg hregAMD64_RDI ( void ) { return mkHReg( 7, HRcInt64, False); }
+HReg hregAMD64_R8 ( void ) { return mkHReg( 8, HRcInt64, False); }
+HReg hregAMD64_R9 ( void ) { return mkHReg( 9, HRcInt64, False); }
+HReg hregAMD64_R10 ( void ) { return mkHReg(10, HRcInt64, False); }
+HReg hregAMD64_R11 ( void ) { return mkHReg(11, HRcInt64, False); }
+HReg hregAMD64_R12 ( void ) { return mkHReg(12, HRcInt64, False); }
+HReg hregAMD64_R13 ( void ) { return mkHReg(13, HRcInt64, False); }
+HReg hregAMD64_R14 ( void ) { return mkHReg(14, HRcInt64, False); }
+HReg hregAMD64_R15 ( void ) { return mkHReg(15, HRcInt64, False); }
+
+//.. HReg hregAMD64_FAKE0 ( void ) { return mkHReg(0, HRcFlt64, False); }
+//.. HReg hregAMD64_FAKE1 ( void ) { return mkHReg(1, HRcFlt64, False); }
+//.. HReg hregAMD64_FAKE2 ( void ) { return mkHReg(2, HRcFlt64, False); }
+//.. HReg hregAMD64_FAKE3 ( void ) { return mkHReg(3, HRcFlt64, False); }
+//.. HReg hregAMD64_FAKE4 ( void ) { return mkHReg(4, HRcFlt64, False); }
+//.. HReg hregAMD64_FAKE5 ( void ) { return mkHReg(5, HRcFlt64, False); }
+//..
+HReg hregAMD64_XMM0 ( void ) { return mkHReg( 0, HRcVec128, False); }
+HReg hregAMD64_XMM1 ( void ) { return mkHReg( 1, HRcVec128, False); }
+HReg hregAMD64_XMM2 ( void ) { return mkHReg( 2, HRcVec128, False); }
+HReg hregAMD64_XMM3 ( void ) { return mkHReg( 3, HRcVec128, False); }
+HReg hregAMD64_XMM4 ( void ) { return mkHReg( 4, HRcVec128, False); }
+HReg hregAMD64_XMM5 ( void ) { return mkHReg( 5, HRcVec128, False); }
+HReg hregAMD64_XMM6 ( void ) { return mkHReg( 6, HRcVec128, False); }
+HReg hregAMD64_XMM7 ( void ) { return mkHReg( 7, HRcVec128, False); }
+HReg hregAMD64_XMM8 ( void ) { return mkHReg( 8, HRcVec128, False); }
+HReg hregAMD64_XMM9 ( void ) { return mkHReg( 9, HRcVec128, False); }
+HReg hregAMD64_XMM10 ( void ) { return mkHReg(10, HRcVec128, False); }
+HReg hregAMD64_XMM11 ( void ) { return mkHReg(11, HRcVec128, False); }
+HReg hregAMD64_XMM12 ( void ) { return mkHReg(12, HRcVec128, False); }
+HReg hregAMD64_XMM13 ( void ) { return mkHReg(13, HRcVec128, False); }
+HReg hregAMD64_XMM14 ( void ) { return mkHReg(14, HRcVec128, False); }
+HReg hregAMD64_XMM15 ( void ) { return mkHReg(15, HRcVec128, False); }
+
+
+void getAllocableRegs_AMD64 ( Int* nregs, HReg** arr )
+{
+#if 0
+ *nregs = 6;
+ *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
+ (*arr)[ 0] = hregAMD64_RSI();
+ (*arr)[ 1] = hregAMD64_RDI();
+ (*arr)[ 2] = hregAMD64_RBX();
+
+ (*arr)[ 3] = hregAMD64_XMM7();
+ (*arr)[ 4] = hregAMD64_XMM8();
+ (*arr)[ 5] = hregAMD64_XMM9();
+#endif
+#if 1
+ *nregs = 20;
+ *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
+ (*arr)[ 0] = hregAMD64_RSI();
+ (*arr)[ 1] = hregAMD64_RDI();
+ (*arr)[ 2] = hregAMD64_R8();
+ (*arr)[ 3] = hregAMD64_R9();
+ (*arr)[ 4] = hregAMD64_R12();
+ (*arr)[ 5] = hregAMD64_R13();
+ (*arr)[ 6] = hregAMD64_R14();
+ (*arr)[ 7] = hregAMD64_R15();
+ (*arr)[ 8] = hregAMD64_RBX();
+
+ (*arr)[ 9] = hregAMD64_XMM3();
+ (*arr)[10] = hregAMD64_XMM4();
+ (*arr)[11] = hregAMD64_XMM5();
+ (*arr)[12] = hregAMD64_XMM6();
+ (*arr)[13] = hregAMD64_XMM7();
+ (*arr)[14] = hregAMD64_XMM8();
+ (*arr)[15] = hregAMD64_XMM9();
+ (*arr)[16] = hregAMD64_XMM10();
+ (*arr)[17] = hregAMD64_XMM11();
+ (*arr)[18] = hregAMD64_XMM12();
+ (*arr)[19] = hregAMD64_R10();
+#endif
+}
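+
+/* A note on the registers excluded from the list above, inferred
+   from their uses later in this file (a sketch, not a normative
+   statement): %rsp is the host stack pointer; %rbp is written by
+   Ain_Goto for non-Boring jump kinds and is kept away from the
+   allocator; %rax and %rdx are trashed by Ain_Goto to carry the
+   next guest address and the dispatcher address; %rcx is needed
+   for variable-amount shifts; and %r11 is the scratch register
+   through which Ain_Call loads its target address. */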
+
+
+/* --------- Condition codes, Intel encoding. --------- */
+
+HChar* showAMD64CondCode ( AMD64CondCode cond )
+{
+ switch (cond) {
+ case Acc_O: return "o";
+ case Acc_NO: return "no";
+ case Acc_B: return "b";
+ case Acc_NB: return "nb";
+ case Acc_Z: return "z";
+ case Acc_NZ: return "nz";
+ case Acc_BE: return "be";
+ case Acc_NBE: return "nbe";
+ case Acc_S: return "s";
+ case Acc_NS: return "ns";
+ case Acc_P: return "p";
+ case Acc_NP: return "np";
+ case Acc_L: return "l";
+ case Acc_NL: return "nl";
+ case Acc_LE: return "le";
+ case Acc_NLE: return "nle";
+ case Acc_ALWAYS: return "ALWAYS";
+      default: vpanic("showAMD64CondCode");
+ }
+}
+
+
+/* --------- AMD64AMode: memory address expressions. --------- */
+
+AMD64AMode* AMD64AMode_IR ( UInt imm32, HReg reg ) {
+ AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
+ am->tag = Aam_IR;
+ am->Aam.IR.imm = imm32;
+ am->Aam.IR.reg = reg;
+ return am;
+}
+AMD64AMode* AMD64AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
+ AMD64AMode* am = LibVEX_Alloc(sizeof(AMD64AMode));
+ am->tag = Aam_IRRS;
+ am->Aam.IRRS.imm = imm32;
+ am->Aam.IRRS.base = base;
+ am->Aam.IRRS.index = indEx;
+ am->Aam.IRRS.shift = shift;
+ vassert(shift >= 0 && shift <= 3);
+ return am;
+}
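+
+/* Worked example: the amode 0x1C(%rbp,%r9,8) -- displacement 0x1C,
+   base %rbp, index %r9, scale 8 -- would be built as
+
+      AMD64AMode_IRRS( 0x1C, hregAMD64_RBP(), hregAMD64_R9(), 3 );
+
+   'shift' holds log2 of the scale, hence the 0..3 assertion above
+   and the "1 << shift" in the printer below. */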
+
+//.. AMD64AMode* dopyAMD64AMode ( AMD64AMode* am ) {
+//.. switch (am->tag) {
+//.. case Xam_IR:
+//.. return AMD64AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
+//.. case Xam_IRRS:
+//.. return AMD64AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base,
+//.. am->Xam.IRRS.index, am->Xam.IRRS.shift );
+//.. default:
+//.. vpanic("dopyAMD64AMode");
+//.. }
+//.. }
+
+void ppAMD64AMode ( AMD64AMode* am ) {
+ switch (am->tag) {
+ case Aam_IR:
+ if (am->Aam.IR.imm == 0)
+ vex_printf("(");
+ else
+ vex_printf("0x%x(", am->Aam.IR.imm);
+ ppHRegAMD64(am->Aam.IR.reg);
+ vex_printf(")");
+ return;
+ case Aam_IRRS:
+ vex_printf("0x%x(", am->Aam.IRRS.imm);
+ ppHRegAMD64(am->Aam.IRRS.base);
+ vex_printf(",");
+ ppHRegAMD64(am->Aam.IRRS.index);
+ vex_printf(",%d)", 1 << am->Aam.IRRS.shift);
+ return;
+ default:
+ vpanic("ppAMD64AMode");
+ }
+}
+
+static void addRegUsage_AMD64AMode ( HRegUsage* u, AMD64AMode* am ) {
+ switch (am->tag) {
+ case Aam_IR:
+ addHRegUse(u, HRmRead, am->Aam.IR.reg);
+ return;
+ case Aam_IRRS:
+ addHRegUse(u, HRmRead, am->Aam.IRRS.base);
+ addHRegUse(u, HRmRead, am->Aam.IRRS.index);
+ return;
+ default:
+ vpanic("addRegUsage_AMD64AMode");
+ }
+}
+
+static void mapRegs_AMD64AMode ( HRegRemap* m, AMD64AMode* am ) {
+ switch (am->tag) {
+ case Aam_IR:
+ am->Aam.IR.reg = lookupHRegRemap(m, am->Aam.IR.reg);
+ return;
+ case Aam_IRRS:
+ am->Aam.IRRS.base = lookupHRegRemap(m, am->Aam.IRRS.base);
+ am->Aam.IRRS.index = lookupHRegRemap(m, am->Aam.IRRS.index);
+ return;
+ default:
+ vpanic("mapRegs_AMD64AMode");
+ }
+}
+
+/* --------- Operand, which can be reg, immediate or memory. --------- */
+
+AMD64RMI* AMD64RMI_Imm ( UInt imm32 ) {
+ AMD64RMI* op = LibVEX_Alloc(sizeof(AMD64RMI));
+ op->tag = Armi_Imm;
+ op->Armi.Imm.imm32 = imm32;
+ return op;
+}
+AMD64RMI* AMD64RMI_Reg ( HReg reg ) {
+ AMD64RMI* op = LibVEX_Alloc(sizeof(AMD64RMI));
+ op->tag = Armi_Reg;
+ op->Armi.Reg.reg = reg;
+ return op;
+}
+AMD64RMI* AMD64RMI_Mem ( AMD64AMode* am ) {
+ AMD64RMI* op = LibVEX_Alloc(sizeof(AMD64RMI));
+ op->tag = Armi_Mem;
+ op->Armi.Mem.am = am;
+ return op;
+}
+
+void ppAMD64RMI ( AMD64RMI* op ) {
+ switch (op->tag) {
+ case Armi_Imm:
+ vex_printf("$0x%x", op->Armi.Imm.imm32);
+ return;
+ case Armi_Reg:
+ ppHRegAMD64(op->Armi.Reg.reg);
+ return;
+ case Armi_Mem:
+ ppAMD64AMode(op->Armi.Mem.am);
+ return;
+ default:
+ vpanic("ppAMD64RMI");
+ }
+}
+
+/* An AMD64RMI can only be used in a "read" context (what would it mean
+ to write or modify a literal?) and so we enumerate its registers
+ accordingly. */
+static void addRegUsage_AMD64RMI ( HRegUsage* u, AMD64RMI* op ) {
+ switch (op->tag) {
+ case Armi_Imm:
+ return;
+ case Armi_Reg:
+ addHRegUse(u, HRmRead, op->Armi.Reg.reg);
+ return;
+ case Armi_Mem:
+ addRegUsage_AMD64AMode(u, op->Armi.Mem.am);
+ return;
+ default:
+ vpanic("addRegUsage_AMD64RMI");
+ }
+}
+
+static void mapRegs_AMD64RMI ( HRegRemap* m, AMD64RMI* op ) {
+ switch (op->tag) {
+ case Armi_Imm:
+ return;
+ case Armi_Reg:
+ op->Armi.Reg.reg = lookupHRegRemap(m, op->Armi.Reg.reg);
+ return;
+ case Armi_Mem:
+ mapRegs_AMD64AMode(m, op->Armi.Mem.am);
+ return;
+ default:
+ vpanic("mapRegs_AMD64RMI");
+ }
+}
+
+
+/* --------- Operand, which can be reg or immediate only. --------- */
+
+AMD64RI* AMD64RI_Imm ( UInt imm32 ) {
+ AMD64RI* op = LibVEX_Alloc(sizeof(AMD64RI));
+ op->tag = Ari_Imm;
+ op->Ari.Imm.imm32 = imm32;
+ return op;
+}
+AMD64RI* AMD64RI_Reg ( HReg reg ) {
+ AMD64RI* op = LibVEX_Alloc(sizeof(AMD64RI));
+ op->tag = Ari_Reg;
+ op->Ari.Reg.reg = reg;
+ return op;
+}
+
+void ppAMD64RI ( AMD64RI* op ) {
+ switch (op->tag) {
+ case Ari_Imm:
+ vex_printf("$0x%x", op->Ari.Imm.imm32);
+ return;
+ case Ari_Reg:
+ ppHRegAMD64(op->Ari.Reg.reg);
+ return;
+ default:
+ vpanic("ppAMD64RI");
+ }
+}
+
+/* An AMD64RI can only be used in a "read" context (what would it mean
+ to write or modify a literal?) and so we enumerate its registers
+ accordingly. */
+static void addRegUsage_AMD64RI ( HRegUsage* u, AMD64RI* op ) {
+ switch (op->tag) {
+ case Ari_Imm:
+ return;
+ case Ari_Reg:
+ addHRegUse(u, HRmRead, op->Ari.Reg.reg);
+ return;
+ default:
+ vpanic("addRegUsage_AMD64RI");
+ }
+}
+
+static void mapRegs_AMD64RI ( HRegRemap* m, AMD64RI* op ) {
+ switch (op->tag) {
+ case Ari_Imm:
+ return;
+ case Ari_Reg:
+ op->Ari.Reg.reg = lookupHRegRemap(m, op->Ari.Reg.reg);
+ return;
+ default:
+ vpanic("mapRegs_AMD64RI");
+ }
+}
+
+
+/* --------- Operand, which can be reg or memory only. --------- */
+
+AMD64RM* AMD64RM_Reg ( HReg reg ) {
+ AMD64RM* op = LibVEX_Alloc(sizeof(AMD64RM));
+ op->tag = Arm_Reg;
+ op->Arm.Reg.reg = reg;
+ return op;
+}
+AMD64RM* AMD64RM_Mem ( AMD64AMode* am ) {
+ AMD64RM* op = LibVEX_Alloc(sizeof(AMD64RM));
+ op->tag = Arm_Mem;
+ op->Arm.Mem.am = am;
+ return op;
+}
+
+void ppAMD64RM ( AMD64RM* op ) {
+ switch (op->tag) {
+ case Arm_Mem:
+ ppAMD64AMode(op->Arm.Mem.am);
+ return;
+ case Arm_Reg:
+ ppHRegAMD64(op->Arm.Reg.reg);
+ return;
+ default:
+ vpanic("ppAMD64RM");
+ }
+}
+
+/* Because an AMD64RM can be both a source or destination operand, we
+ have to supply a mode -- pertaining to the operand as a whole --
+ indicating how it's being used. */
+static void addRegUsage_AMD64RM ( HRegUsage* u, AMD64RM* op, HRegMode mode ) {
+ switch (op->tag) {
+ case Arm_Mem:
+ /* Memory is read, written or modified. So we just want to
+ know the regs read by the amode. */
+ addRegUsage_AMD64AMode(u, op->Arm.Mem.am);
+ return;
+ case Arm_Reg:
+ /* reg is read, written or modified. Add it in the
+ appropriate way. */
+ addHRegUse(u, mode, op->Arm.Reg.reg);
+ return;
+ default:
+ vpanic("addRegUsage_AMD64RM");
+ }
+}
+
+static void mapRegs_AMD64RM ( HRegRemap* m, AMD64RM* op )
+{
+ switch (op->tag) {
+ case Arm_Mem:
+ mapRegs_AMD64AMode(m, op->Arm.Mem.am);
+ return;
+ case Arm_Reg:
+ op->Arm.Reg.reg = lookupHRegRemap(m, op->Arm.Reg.reg);
+ return;
+ default:
+ vpanic("mapRegs_AMD64RM");
+ }
+}
+
+
+/* --------- Instructions. --------- */
+
+static HChar* showAMD64ScalarSz ( Int sz ) {
+ switch (sz) {
+ case 2: return "w";
+ case 4: return "l";
+ case 8: return "q";
+ default: vpanic("showAMD64ScalarSz");
+ }
+}
+
+HChar* showAMD64UnaryOp ( AMD64UnaryOp op ) {
+ switch (op) {
+ case Aun_NOT: return "not";
+ case Aun_NEG: return "neg";
+ default: vpanic("showAMD64UnaryOp");
+ }
+}
+
+HChar* showAMD64AluOp ( AMD64AluOp op ) {
+ switch (op) {
+ case Aalu_MOV: return "mov";
+ case Aalu_CMP: return "cmp";
+ case Aalu_ADD: return "add";
+ case Aalu_SUB: return "sub";
+ case Aalu_ADC: return "adc";
+ case Aalu_SBB: return "sbb";
+ case Aalu_AND: return "and";
+ case Aalu_OR: return "or";
+ case Aalu_XOR: return "xor";
+ case Aalu_MUL: return "imul";
+ default: vpanic("showAMD64AluOp");
+ }
+}
+
+HChar* showAMD64ShiftOp ( AMD64ShiftOp op ) {
+ switch (op) {
+ case Ash_SHL: return "shl";
+ case Ash_SHR: return "shr";
+ case Ash_SAR: return "sar";
+ default: vpanic("showAMD64ShiftOp");
+ }
+}
+
+HChar* showA87FpOp ( A87FpOp op ) {
+ switch (op) {
+//.. case Xfp_ADD: return "add";
+//.. case Xfp_SUB: return "sub";
+//.. case Xfp_MUL: return "mul";
+//.. case Xfp_DIV: return "div";
+ case Afp_SCALE: return "scale";
+ case Afp_ATAN: return "atan";
+ case Afp_YL2X: return "yl2x";
+ case Afp_YL2XP1: return "yl2xp1";
+ case Afp_PREM: return "prem";
+ case Afp_PREM1: return "prem1";
+ case Afp_SQRT: return "sqrt";
+//.. case Xfp_ABS: return "abs";
+//.. case Xfp_NEG: return "chs";
+//.. case Xfp_MOV: return "mov";
+ case Afp_SIN: return "sin";
+ case Afp_COS: return "cos";
+ case Afp_TAN: return "tan";
+ case Afp_ROUND: return "round";
+ case Afp_2XM1: return "2xm1";
+ default: vpanic("showA87FpOp");
+ }
+}
+
+HChar* showAMD64SseOp ( AMD64SseOp op ) {
+ switch (op) {
+ case Asse_MOV: return "movups";
+ case Asse_ADDF: return "add";
+ case Asse_SUBF: return "sub";
+ case Asse_MULF: return "mul";
+ case Asse_DIVF: return "div";
+ case Asse_MAXF: return "max";
+ case Asse_MINF: return "min";
+ case Asse_CMPEQF: return "cmpFeq";
+ case Asse_CMPLTF: return "cmpFlt";
+ case Asse_CMPLEF: return "cmpFle";
+ case Asse_CMPUNF: return "cmpFun";
+ case Asse_RCPF: return "rcp";
+ case Asse_RSQRTF: return "rsqrt";
+ case Asse_SQRTF: return "sqrt";
+ case Asse_AND: return "and";
+ case Asse_OR: return "or";
+ case Asse_XOR: return "xor";
+ case Asse_ANDN: return "andn";
+ case Asse_ADD8: return "paddb";
+ case Asse_ADD16: return "paddw";
+ case Asse_ADD32: return "paddd";
+ case Asse_ADD64: return "paddq";
+ case Asse_QADD8U: return "paddusb";
+ case Asse_QADD16U: return "paddusw";
+ case Asse_QADD8S: return "paddsb";
+ case Asse_QADD16S: return "paddsw";
+ case Asse_SUB8: return "psubb";
+ case Asse_SUB16: return "psubw";
+ case Asse_SUB32: return "psubd";
+ case Asse_SUB64: return "psubq";
+ case Asse_QSUB8U: return "psubusb";
+ case Asse_QSUB16U: return "psubusw";
+ case Asse_QSUB8S: return "psubsb";
+ case Asse_QSUB16S: return "psubsw";
+ case Asse_MUL16: return "pmullw";
+ case Asse_MULHI16U: return "pmulhuw";
+ case Asse_MULHI16S: return "pmulhw";
+ case Asse_AVG8U: return "pavgb";
+ case Asse_AVG16U: return "pavgw";
+ case Asse_MAX16S: return "pmaxw";
+ case Asse_MAX8U: return "pmaxub";
+ case Asse_MIN16S: return "pminw";
+ case Asse_MIN8U: return "pminub";
+ case Asse_CMPEQ8: return "pcmpeqb";
+ case Asse_CMPEQ16: return "pcmpeqw";
+ case Asse_CMPEQ32: return "pcmpeqd";
+ case Asse_CMPGT8S: return "pcmpgtb";
+ case Asse_CMPGT16S: return "pcmpgtw";
+ case Asse_CMPGT32S: return "pcmpgtd";
+ case Asse_SHL16: return "psllw";
+ case Asse_SHL32: return "pslld";
+ case Asse_SHL64: return "psllq";
+ case Asse_SHR16: return "psrlw";
+ case Asse_SHR32: return "psrld";
+ case Asse_SHR64: return "psrlq";
+ case Asse_SAR16: return "psraw";
+ case Asse_SAR32: return "psrad";
+ case Asse_PACKSSD: return "packssdw";
+ case Asse_PACKSSW: return "packsswb";
+ case Asse_PACKUSW: return "packuswb";
+ case Asse_UNPCKHB: return "punpckhb";
+ case Asse_UNPCKHW: return "punpckhw";
+ case Asse_UNPCKHD: return "punpckhd";
+ case Asse_UNPCKHQ: return "punpckhq";
+ case Asse_UNPCKLB: return "punpcklb";
+ case Asse_UNPCKLW: return "punpcklw";
+ case Asse_UNPCKLD: return "punpckld";
+ case Asse_UNPCKLQ: return "punpcklq";
+ default: vpanic("showAMD64SseOp");
+ }
+}
+
+AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Imm64;
+ i->Ain.Imm64.imm64 = imm64;
+ i->Ain.Imm64.dst = dst;
+ return i;
+}
+AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp op, AMD64RMI* src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Alu64R;
+ i->Ain.Alu64R.op = op;
+ i->Ain.Alu64R.src = src;
+ i->Ain.Alu64R.dst = dst;
+ return i;
+}
+AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp op, AMD64RI* src, AMD64AMode* dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Alu64M;
+ i->Ain.Alu64M.op = op;
+ i->Ain.Alu64M.src = src;
+ i->Ain.Alu64M.dst = dst;
+ vassert(op != Aalu_MUL);
+ return i;
+}
+AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp op, UInt src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Sh64;
+ i->Ain.Sh64.op = op;
+ i->Ain.Sh64.src = src;
+ i->Ain.Sh64.dst = dst;
+ return i;
+}
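+/* Note: in Ain_Sh64 a 'src' of zero means "shift by %cl" rather
+   than by an immediate; the printer and getRegUsage_AMD64Instr
+   below both special-case src == 0 accordingly. */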
+AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Test64;
+ i->Ain.Test64.imm32 = imm32;
+ i->Ain.Test64.dst = dst;
+ return i;
+}
+AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Unary64;
+ i->Ain.Unary64.op = op;
+ i->Ain.Unary64.dst = dst;
+ return i;
+}
+AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Lea64;
+ i->Ain.Lea64.am = am;
+ i->Ain.Lea64.dst = dst;
+ return i;
+}
+AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* src ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_MulL;
+ i->Ain.MulL.syned = syned;
+ i->Ain.MulL.src = src;
+ return i;
+}
+AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* src ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Div;
+ i->Ain.Div.syned = syned;
+ i->Ain.Div.sz = sz;
+ i->Ain.Div.src = src;
+ vassert(sz == 4 || sz == 8);
+ return i;
+}
+//.. AMD64Instr* AMD64Instr_Sh3232 ( AMD64ShiftOp op, UInt amt, HReg src, HReg dst ) {
+//.. AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+//.. i->tag = Xin_Sh3232;
+//.. i->Xin.Sh3232.op = op;
+//.. i->Xin.Sh3232.amt = amt;
+//.. i->Xin.Sh3232.src = src;
+//.. i->Xin.Sh3232.dst = dst;
+//.. vassert(op == Xsh_SHL || op == Xsh_SHR);
+//.. return i;
+//.. }
+AMD64Instr* AMD64Instr_Push( AMD64RMI* src ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Push;
+ i->Ain.Push.src = src;
+ return i;
+}
+AMD64Instr* AMD64Instr_Call ( AMD64CondCode cond, Addr64 target, Int regparms ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Call;
+ i->Ain.Call.cond = cond;
+ i->Ain.Call.target = target;
+ i->Ain.Call.regparms = regparms;
+ vassert(regparms >= 0 && regparms <= 6);
+ return i;
+}
+AMD64Instr* AMD64Instr_Goto ( IRJumpKind jk, AMD64CondCode cond, AMD64RI* dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Goto;
+ i->Ain.Goto.cond = cond;
+ i->Ain.Goto.dst = dst;
+ i->Ain.Goto.jk = jk;
+ return i;
+}
+AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode cond, AMD64RM* src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_CMov64;
+ i->Ain.CMov64.cond = cond;
+ i->Ain.CMov64.src = src;
+ i->Ain.CMov64.dst = dst;
+ vassert(cond != Acc_ALWAYS);
+ return i;
+}
+AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_MovxLQ;
+ i->Ain.MovxLQ.syned = syned;
+ i->Ain.MovxLQ.src = src;
+ i->Ain.MovxLQ.dst = dst;
+ return i;
+}
+AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
+ AMD64AMode* src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_LoadEX;
+ i->Ain.LoadEX.szSmall = szSmall;
+ i->Ain.LoadEX.syned = syned;
+ i->Ain.LoadEX.src = src;
+ i->Ain.LoadEX.dst = dst;
+ vassert(szSmall == 1 || szSmall == 2 || szSmall == 4);
+ return i;
+}
+AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Store;
+ i->Ain.Store.sz = sz;
+ i->Ain.Store.src = src;
+ i->Ain.Store.dst = dst;
+ vassert(sz == 1 || sz == 2 || sz == 4);
+ return i;
+}
+AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Set64;
+ i->Ain.Set64.cond = cond;
+ i->Ain.Set64.dst = dst;
+ return i;
+}
+AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Bsfr64;
+ i->Ain.Bsfr64.isFwds = isFwds;
+ i->Ain.Bsfr64.src = src;
+ i->Ain.Bsfr64.dst = dst;
+ return i;
+}
+AMD64Instr* AMD64Instr_MFence ( void ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_MFence;
+ return i;
+}
+AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_ACAS;
+ i->Ain.ACAS.addr = addr;
+ i->Ain.ACAS.sz = sz;
+ vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
+ return i;
+}
+AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_DACAS;
+ i->Ain.DACAS.addr = addr;
+ i->Ain.DACAS.sz = sz;
+ vassert(sz == 8 || sz == 4);
+ return i;
+}
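+/* Register conventions for the two CAS forms, as fixed by the
+   printer and getRegUsage_AMD64Instr below: Ain_ACAS compares the
+   value at 'addr' with %rax and, if equal, writes %rbx there,
+   leaving the observed old value in %rax; Ain_DACAS is the
+   double-width variant, with %rdx:%rax as the expected value and
+   %rcx:%rbx as the new value. */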
+
+AMD64Instr* AMD64Instr_A87Free ( Int nregs )
+{
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_A87Free;
+ i->Ain.A87Free.nregs = nregs;
+ vassert(nregs >= 1 && nregs <= 7);
+ return i;
+}
+AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB )
+{
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_A87PushPop;
+ i->Ain.A87PushPop.addr = addr;
+ i->Ain.A87PushPop.isPush = isPush;
+ i->Ain.A87PushPop.szB = szB;
+ vassert(szB == 8 || szB == 4);
+ return i;
+}
+AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op )
+{
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_A87FpOp;
+ i->Ain.A87FpOp.op = op;
+ return i;
+}
+AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr )
+{
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_A87LdCW;
+ i->Ain.A87LdCW.addr = addr;
+ return i;
+}
+AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr )
+{
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_A87StSW;
+ i->Ain.A87StSW.addr = addr;
+ return i;
+}
+
+//.. AMD64Instr* AMD64Instr_FpUnary ( AMD64FpOp op, HReg src, HReg dst ) {
+//.. AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+//.. i->tag = Xin_FpUnary;
+//.. i->Xin.FpUnary.op = op;
+//.. i->Xin.FpUnary.src = src;
+//.. i->Xin.FpUnary.dst = dst;
+//.. return i;
+//.. }
+//.. AMD64Instr* AMD64Instr_FpBinary ( AMD64FpOp op, HReg srcL, HReg srcR, HReg dst ) {
+//.. AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+//.. i->tag = Xin_FpBinary;
+//.. i->Xin.FpBinary.op = op;
+//.. i->Xin.FpBinary.srcL = srcL;
+//.. i->Xin.FpBinary.srcR = srcR;
+//.. i->Xin.FpBinary.dst = dst;
+//.. return i;
+//.. }
+//.. AMD64Instr* AMD64Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, AMD64AMode* addr ) {
+//.. AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+//.. i->tag = Xin_FpLdSt;
+//.. i->Xin.FpLdSt.isLoad = isLoad;
+//.. i->Xin.FpLdSt.sz = sz;
+//.. i->Xin.FpLdSt.reg = reg;
+//.. i->Xin.FpLdSt.addr = addr;
+//.. vassert(sz == 4 || sz == 8);
+//.. return i;
+//.. }
+//.. AMD64Instr* AMD64Instr_FpLdStI ( Bool isLoad, UChar sz,
+//.. HReg reg, AMD64AMode* addr ) {
+//.. AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+//.. i->tag = Xin_FpLdStI;
+//.. i->Xin.FpLdStI.isLoad = isLoad;
+//.. i->Xin.FpLdStI.sz = sz;
+//.. i->Xin.FpLdStI.reg = reg;
+//.. i->Xin.FpLdStI.addr = addr;
+//.. vassert(sz == 2 || sz == 4 || sz == 8);
+//.. return i;
+//.. }
+//.. AMD64Instr* AMD64Instr_Fp64to32 ( HReg src, HReg dst ) {
+//.. AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+//.. i->tag = Xin_Fp64to32;
+//.. i->Xin.Fp64to32.src = src;
+//.. i->Xin.Fp64to32.dst = dst;
+//.. return i;
+//.. }
+//.. AMD64Instr* AMD64Instr_FpCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
+//.. AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+//.. i->tag = Xin_FpCMov;
+//.. i->Xin.FpCMov.cond = cond;
+//.. i->Xin.FpCMov.src = src;
+//.. i->Xin.FpCMov.dst = dst;
+//.. vassert(cond != Xcc_ALWAYS);
+//.. return i;
+//.. }
+AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* addr ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_LdMXCSR;
+ i->Ain.LdMXCSR.addr = addr;
+ return i;
+}
+//.. AMD64Instr* AMD64Instr_FpStSW_AX ( void ) {
+//.. AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+//.. i->tag = Xin_FpStSW_AX;
+//.. return i;
+//.. }
+AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_SseUComIS;
+ i->Ain.SseUComIS.sz = toUChar(sz);
+ i->Ain.SseUComIS.srcL = srcL;
+ i->Ain.SseUComIS.srcR = srcR;
+ i->Ain.SseUComIS.dst = dst;
+ vassert(sz == 4 || sz == 8);
+ return i;
+}
+AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_SseSI2SF;
+ i->Ain.SseSI2SF.szS = toUChar(szS);
+ i->Ain.SseSI2SF.szD = toUChar(szD);
+ i->Ain.SseSI2SF.src = src;
+ i->Ain.SseSI2SF.dst = dst;
+ vassert(szS == 4 || szS == 8);
+ vassert(szD == 4 || szD == 8);
+ return i;
+}
+AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_SseSF2SI;
+ i->Ain.SseSF2SI.szS = toUChar(szS);
+ i->Ain.SseSF2SI.szD = toUChar(szD);
+ i->Ain.SseSF2SI.src = src;
+ i->Ain.SseSF2SI.dst = dst;
+ vassert(szS == 4 || szS == 8);
+ vassert(szD == 4 || szD == 8);
+ return i;
+}
+AMD64Instr* AMD64Instr_SseSDSS ( Bool from64, HReg src, HReg dst )
+{
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_SseSDSS;
+ i->Ain.SseSDSS.from64 = from64;
+ i->Ain.SseSDSS.src = src;
+ i->Ain.SseSDSS.dst = dst;
+ return i;
+}
+
+//.. AMD64Instr* AMD64Instr_SseConst ( UShort con, HReg dst ) {
+//.. AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+//.. i->tag = Xin_SseConst;
+//.. i->Xin.SseConst.con = con;
+//.. i->Xin.SseConst.dst = dst;
+//.. vassert(hregClass(dst) == HRcVec128);
+//.. return i;
+//.. }
+AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz,
+ HReg reg, AMD64AMode* addr ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_SseLdSt;
+ i->Ain.SseLdSt.isLoad = isLoad;
+ i->Ain.SseLdSt.sz = toUChar(sz);
+ i->Ain.SseLdSt.reg = reg;
+ i->Ain.SseLdSt.addr = addr;
+ vassert(sz == 4 || sz == 8 || sz == 16);
+ return i;
+}
+AMD64Instr* AMD64Instr_SseLdzLO ( Int sz, HReg reg, AMD64AMode* addr )
+{
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_SseLdzLO;
+ i->Ain.SseLdzLO.sz = sz;
+ i->Ain.SseLdzLO.reg = reg;
+ i->Ain.SseLdzLO.addr = addr;
+ vassert(sz == 4 || sz == 8);
+ return i;
+}
+AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp op, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Sse32Fx4;
+ i->Ain.Sse32Fx4.op = op;
+ i->Ain.Sse32Fx4.src = src;
+ i->Ain.Sse32Fx4.dst = dst;
+ vassert(op != Asse_MOV);
+ return i;
+}
+AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp op, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Sse32FLo;
+ i->Ain.Sse32FLo.op = op;
+ i->Ain.Sse32FLo.src = src;
+ i->Ain.Sse32FLo.dst = dst;
+ vassert(op != Asse_MOV);
+ return i;
+}
+AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp op, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Sse64Fx2;
+ i->Ain.Sse64Fx2.op = op;
+ i->Ain.Sse64Fx2.src = src;
+ i->Ain.Sse64Fx2.dst = dst;
+ vassert(op != Asse_MOV);
+ return i;
+}
+AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp op, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_Sse64FLo;
+ i->Ain.Sse64FLo.op = op;
+ i->Ain.Sse64FLo.src = src;
+ i->Ain.Sse64FLo.dst = dst;
+ vassert(op != Asse_MOV);
+ return i;
+}
+AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp op, HReg re, HReg rg ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_SseReRg;
+ i->Ain.SseReRg.op = op;
+ i->Ain.SseReRg.src = re;
+ i->Ain.SseReRg.dst = rg;
+ return i;
+}
+AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode cond, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_SseCMov;
+ i->Ain.SseCMov.cond = cond;
+ i->Ain.SseCMov.src = src;
+ i->Ain.SseCMov.dst = dst;
+ vassert(cond != Acc_ALWAYS);
+ return i;
+}
+AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst ) {
+ AMD64Instr* i = LibVEX_Alloc(sizeof(AMD64Instr));
+ i->tag = Ain_SseShuf;
+ i->Ain.SseShuf.order = order;
+ i->Ain.SseShuf.src = src;
+ i->Ain.SseShuf.dst = dst;
+ vassert(order >= 0 && order <= 0xFF);
+ return i;
+}
+
+void ppAMD64Instr ( AMD64Instr* i, Bool mode64 )
+{
+ vassert(mode64 == True);
+ switch (i->tag) {
+ case Ain_Imm64:
+ vex_printf("movabsq $0x%llx,", i->Ain.Imm64.imm64);
+ ppHRegAMD64(i->Ain.Imm64.dst);
+ return;
+ case Ain_Alu64R:
+ vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64R.op));
+ ppAMD64RMI(i->Ain.Alu64R.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Alu64R.dst);
+ return;
+ case Ain_Alu64M:
+ vex_printf("%sq ", showAMD64AluOp(i->Ain.Alu64M.op));
+ ppAMD64RI(i->Ain.Alu64M.src);
+ vex_printf(",");
+ ppAMD64AMode(i->Ain.Alu64M.dst);
+ return;
+ case Ain_Sh64:
+ vex_printf("%sq ", showAMD64ShiftOp(i->Ain.Sh64.op));
+ if (i->Ain.Sh64.src == 0)
+ vex_printf("%%cl,");
+ else
+ vex_printf("$%d,", (Int)i->Ain.Sh64.src);
+ ppHRegAMD64(i->Ain.Sh64.dst);
+ return;
+ case Ain_Test64:
+ vex_printf("testq $%d,", (Int)i->Ain.Test64.imm32);
+ ppHRegAMD64(i->Ain.Test64.dst);
+ return;
+ case Ain_Unary64:
+ vex_printf("%sq ", showAMD64UnaryOp(i->Ain.Unary64.op));
+ ppHRegAMD64(i->Ain.Unary64.dst);
+ return;
+ case Ain_Lea64:
+ vex_printf("leaq ");
+ ppAMD64AMode(i->Ain.Lea64.am);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Lea64.dst);
+ return;
+ case Ain_MulL:
+ vex_printf("%cmulq ", i->Ain.MulL.syned ? 's' : 'u');
+ ppAMD64RM(i->Ain.MulL.src);
+ return;
+ case Ain_Div:
+ vex_printf("%cdiv%s ",
+ i->Ain.Div.syned ? 's' : 'u',
+ showAMD64ScalarSz(i->Ain.Div.sz));
+ ppAMD64RM(i->Ain.Div.src);
+ return;
+//.. case Xin_Sh3232:
+//.. vex_printf("%sdl ", showAMD64ShiftOp(i->Xin.Sh3232.op));
+//.. if (i->Xin.Sh3232.amt == 0)
+//.. vex_printf(" %%cl,");
+//.. else
+//.. vex_printf(" $%d,", i->Xin.Sh3232.amt);
+//.. ppHRegAMD64(i->Xin.Sh3232.src);
+//.. vex_printf(",");
+//.. ppHRegAMD64(i->Xin.Sh3232.dst);
+//.. return;
+ case Ain_Push:
+ vex_printf("pushq ");
+ ppAMD64RMI(i->Ain.Push.src);
+ return;
+ case Ain_Call:
+ vex_printf("call%s[%d] ",
+ i->Ain.Call.cond==Acc_ALWAYS
+ ? "" : showAMD64CondCode(i->Ain.Call.cond),
+ i->Ain.Call.regparms );
+ vex_printf("0x%llx", i->Ain.Call.target);
+ break;
+ case Ain_Goto:
+ if (i->Ain.Goto.cond != Acc_ALWAYS) {
+ vex_printf("if (%%rflags.%s) { ",
+ showAMD64CondCode(i->Ain.Goto.cond));
+ }
+ if (i->Ain.Goto.jk != Ijk_Boring
+ && i->Ain.Goto.jk != Ijk_Call
+ && i->Ain.Goto.jk != Ijk_Ret) {
+ vex_printf("movl $");
+ ppIRJumpKind(i->Ain.Goto.jk);
+ vex_printf(",%%ebp ; ");
+ }
+ vex_printf("movq ");
+ ppAMD64RI(i->Ain.Goto.dst);
+ vex_printf(",%%rax ; movabsq $dispatcher_addr,%%rdx ; jmp *%%rdx");
+ if (i->Ain.Goto.cond != Acc_ALWAYS) {
+ vex_printf(" }");
+ }
+ return;
+ case Ain_CMov64:
+ vex_printf("cmov%s ", showAMD64CondCode(i->Ain.CMov64.cond));
+ ppAMD64RM(i->Ain.CMov64.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.CMov64.dst);
+ return;
+ case Ain_MovxLQ:
+ vex_printf("mov%clq ", i->Ain.MovxLQ.syned ? 's' : 'z');
+ ppHRegAMD64_lo32(i->Ain.MovxLQ.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.MovxLQ.dst);
+ return;
+ case Ain_LoadEX:
+ if (i->Ain.LoadEX.szSmall==4 && !i->Ain.LoadEX.syned) {
+ vex_printf("movl ");
+ ppAMD64AMode(i->Ain.LoadEX.src);
+ vex_printf(",");
+ ppHRegAMD64_lo32(i->Ain.LoadEX.dst);
+ } else {
+ vex_printf("mov%c%cq ",
+ i->Ain.LoadEX.syned ? 's' : 'z',
+ i->Ain.LoadEX.szSmall==1
+ ? 'b'
+ : (i->Ain.LoadEX.szSmall==2 ? 'w' : 'l'));
+ ppAMD64AMode(i->Ain.LoadEX.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.LoadEX.dst);
+ }
+ return;
+ case Ain_Store:
+ vex_printf("mov%c ", i->Ain.Store.sz==1 ? 'b'
+ : (i->Ain.Store.sz==2 ? 'w' : 'l'));
+ ppHRegAMD64(i->Ain.Store.src);
+ vex_printf(",");
+ ppAMD64AMode(i->Ain.Store.dst);
+ return;
+ case Ain_Set64:
+ vex_printf("setq%s ", showAMD64CondCode(i->Ain.Set64.cond));
+ ppHRegAMD64(i->Ain.Set64.dst);
+ return;
+ case Ain_Bsfr64:
+ vex_printf("bs%cq ", i->Ain.Bsfr64.isFwds ? 'f' : 'r');
+ ppHRegAMD64(i->Ain.Bsfr64.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Bsfr64.dst);
+ return;
+ case Ain_MFence:
+ vex_printf("mfence" );
+ return;
+ case Ain_ACAS:
+ vex_printf("lock cmpxchg%c ",
+ i->Ain.ACAS.sz==1 ? 'b' : i->Ain.ACAS.sz==2 ? 'w'
+ : i->Ain.ACAS.sz==4 ? 'l' : 'q' );
+ vex_printf("{%%rax->%%rbx},");
+ ppAMD64AMode(i->Ain.ACAS.addr);
+ return;
+ case Ain_DACAS:
+ vex_printf("lock cmpxchg%db {%%rdx:%%rax->%%rcx:%%rbx},",
+ (Int)(2 * i->Ain.DACAS.sz));
+ ppAMD64AMode(i->Ain.DACAS.addr);
+ return;
+ case Ain_A87Free:
+ vex_printf("ffree %%st(7..%d)", 8 - i->Ain.A87Free.nregs );
+ break;
+ case Ain_A87PushPop:
+ vex_printf(i->Ain.A87PushPop.isPush ? "fld%c " : "fstp%c ",
+ i->Ain.A87PushPop.szB == 4 ? 's' : 'l');
+ ppAMD64AMode(i->Ain.A87PushPop.addr);
+ break;
+ case Ain_A87FpOp:
+ vex_printf("f%s", showA87FpOp(i->Ain.A87FpOp.op));
+ break;
+ case Ain_A87LdCW:
+ vex_printf("fldcw ");
+ ppAMD64AMode(i->Ain.A87LdCW.addr);
+ break;
+ case Ain_A87StSW:
+ vex_printf("fstsw ");
+ ppAMD64AMode(i->Ain.A87StSW.addr);
+ break;
+//.. case Xin_FpUnary:
+//.. vex_printf("g%sD ", showAMD64FpOp(i->Xin.FpUnary.op));
+//.. ppHRegAMD64(i->Xin.FpUnary.src);
+//.. vex_printf(",");
+//.. ppHRegAMD64(i->Xin.FpUnary.dst);
+//.. break;
+//.. case Xin_FpBinary:
+//.. vex_printf("g%sD ", showAMD64FpOp(i->Xin.FpBinary.op));
+//.. ppHRegAMD64(i->Xin.FpBinary.srcL);
+//.. vex_printf(",");
+//.. ppHRegAMD64(i->Xin.FpBinary.srcR);
+//.. vex_printf(",");
+//.. ppHRegAMD64(i->Xin.FpBinary.dst);
+//.. break;
+//.. case Xin_FpLdSt:
+//.. if (i->Xin.FpLdSt.isLoad) {
+//.. vex_printf("gld%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
+//.. ppAMD64AMode(i->Xin.FpLdSt.addr);
+//.. vex_printf(", ");
+//.. ppHRegAMD64(i->Xin.FpLdSt.reg);
+//.. } else {
+//.. vex_printf("gst%c " , i->Xin.FpLdSt.sz==8 ? 'D' : 'F');
+//.. ppHRegAMD64(i->Xin.FpLdSt.reg);
+//.. vex_printf(", ");
+//.. ppAMD64AMode(i->Xin.FpLdSt.addr);
+//.. }
+//.. return;
+//.. case Xin_FpLdStI:
+//.. if (i->Xin.FpLdStI.isLoad) {
+//.. vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
+//.. i->Xin.FpLdStI.sz==4 ? "l" : "w");
+//.. ppAMD64AMode(i->Xin.FpLdStI.addr);
+//.. vex_printf(", ");
+//.. ppHRegAMD64(i->Xin.FpLdStI.reg);
+//.. } else {
+//.. vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
+//.. i->Xin.FpLdStI.sz==4 ? "l" : "w");
+//.. ppHRegAMD64(i->Xin.FpLdStI.reg);
+//.. vex_printf(", ");
+//.. ppAMD64AMode(i->Xin.FpLdStI.addr);
+//.. }
+//.. return;
+//.. case Xin_Fp64to32:
+//.. vex_printf("gdtof ");
+//.. ppHRegAMD64(i->Xin.Fp64to32.src);
+//.. vex_printf(",");
+//.. ppHRegAMD64(i->Xin.Fp64to32.dst);
+//.. return;
+//.. case Xin_FpCMov:
+//.. vex_printf("gcmov%s ", showAMD64CondCode(i->Xin.FpCMov.cond));
+//.. ppHRegAMD64(i->Xin.FpCMov.src);
+//.. vex_printf(",");
+//.. ppHRegAMD64(i->Xin.FpCMov.dst);
+//.. return;
+//.. case Xin_FpLdStCW:
+//.. vex_printf(i->Xin.FpLdStCW.isLoad ? "fldcw " : "fstcw ");
+//.. ppAMD64AMode(i->Xin.FpLdStCW.addr);
+//.. return;
+//.. case Xin_FpStSW_AX:
+//.. vex_printf("fstsw %%ax");
+//.. return;
+ case Ain_LdMXCSR:
+ vex_printf("ldmxcsr ");
+ ppAMD64AMode(i->Ain.LdMXCSR.addr);
+ break;
+ case Ain_SseUComIS:
+ vex_printf("ucomis%s ", i->Ain.SseUComIS.sz==4 ? "s" : "d");
+ ppHRegAMD64(i->Ain.SseUComIS.srcL);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseUComIS.srcR);
+ vex_printf(" ; pushfq ; popq ");
+ ppHRegAMD64(i->Ain.SseUComIS.dst);
+ break;
+ case Ain_SseSI2SF:
+ vex_printf("cvtsi2s%s ", i->Ain.SseSI2SF.szD==4 ? "s" : "d");
+ (i->Ain.SseSI2SF.szS==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
+ (i->Ain.SseSI2SF.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseSI2SF.dst);
+ break;
+ case Ain_SseSF2SI:
+ vex_printf("cvts%s2si ", i->Ain.SseSF2SI.szS==4 ? "s" : "d");
+ ppHRegAMD64(i->Ain.SseSF2SI.src);
+ vex_printf(",");
+ (i->Ain.SseSF2SI.szD==4 ? ppHRegAMD64_lo32 : ppHRegAMD64)
+ (i->Ain.SseSF2SI.dst);
+ break;
+ case Ain_SseSDSS:
+ vex_printf(i->Ain.SseSDSS.from64 ? "cvtsd2ss " : "cvtss2sd ");
+ ppHRegAMD64(i->Ain.SseSDSS.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseSDSS.dst);
+ break;
+//.. case Xin_SseConst:
+//.. vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
+//.. ppHRegAMD64(i->Xin.SseConst.dst);
+//.. break;
+ case Ain_SseLdSt:
+ switch (i->Ain.SseLdSt.sz) {
+ case 4: vex_printf("movss "); break;
+ case 8: vex_printf("movsd "); break;
+ case 16: vex_printf("movups "); break;
+ default: vassert(0);
+ }
+ if (i->Ain.SseLdSt.isLoad) {
+ ppAMD64AMode(i->Ain.SseLdSt.addr);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseLdSt.reg);
+ } else {
+ ppHRegAMD64(i->Ain.SseLdSt.reg);
+ vex_printf(",");
+ ppAMD64AMode(i->Ain.SseLdSt.addr);
+ }
+ return;
+ case Ain_SseLdzLO:
+ vex_printf("movs%s ", i->Ain.SseLdzLO.sz==4 ? "s" : "d");
+ ppAMD64AMode(i->Ain.SseLdzLO.addr);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseLdzLO.reg);
+ return;
+ case Ain_Sse32Fx4:
+ vex_printf("%sps ", showAMD64SseOp(i->Ain.Sse32Fx4.op));
+ ppHRegAMD64(i->Ain.Sse32Fx4.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Sse32Fx4.dst);
+ return;
+ case Ain_Sse32FLo:
+ vex_printf("%sss ", showAMD64SseOp(i->Ain.Sse32FLo.op));
+ ppHRegAMD64(i->Ain.Sse32FLo.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Sse32FLo.dst);
+ return;
+ case Ain_Sse64Fx2:
+ vex_printf("%spd ", showAMD64SseOp(i->Ain.Sse64Fx2.op));
+ ppHRegAMD64(i->Ain.Sse64Fx2.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Sse64Fx2.dst);
+ return;
+ case Ain_Sse64FLo:
+ vex_printf("%ssd ", showAMD64SseOp(i->Ain.Sse64FLo.op));
+ ppHRegAMD64(i->Ain.Sse64FLo.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.Sse64FLo.dst);
+ return;
+ case Ain_SseReRg:
+ vex_printf("%s ", showAMD64SseOp(i->Ain.SseReRg.op));
+ ppHRegAMD64(i->Ain.SseReRg.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseReRg.dst);
+ return;
+ case Ain_SseCMov:
+ vex_printf("cmov%s ", showAMD64CondCode(i->Ain.SseCMov.cond));
+ ppHRegAMD64(i->Ain.SseCMov.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseCMov.dst);
+ return;
+ case Ain_SseShuf:
+ vex_printf("pshufd $0x%x,", i->Ain.SseShuf.order);
+ ppHRegAMD64(i->Ain.SseShuf.src);
+ vex_printf(",");
+ ppHRegAMD64(i->Ain.SseShuf.dst);
+ return;
+
+ default:
+ vpanic("ppAMD64Instr");
+ }
+}
+
+/* --------- Helpers for register allocation. --------- */
+
+void getRegUsage_AMD64Instr ( HRegUsage* u, AMD64Instr* i, Bool mode64 )
+{
+ Bool unary;
+ vassert(mode64 == True);
+ initHRegUsage(u);
+ switch (i->tag) {
+ case Ain_Imm64:
+ addHRegUse(u, HRmWrite, i->Ain.Imm64.dst);
+ return;
+ case Ain_Alu64R:
+ addRegUsage_AMD64RMI(u, i->Ain.Alu64R.src);
+ if (i->Ain.Alu64R.op == Aalu_MOV) {
+ addHRegUse(u, HRmWrite, i->Ain.Alu64R.dst);
+ return;
+ }
+ if (i->Ain.Alu64R.op == Aalu_CMP) {
+ addHRegUse(u, HRmRead, i->Ain.Alu64R.dst);
+ return;
+ }
+ addHRegUse(u, HRmModify, i->Ain.Alu64R.dst);
+ return;
+ case Ain_Alu64M:
+ addRegUsage_AMD64RI(u, i->Ain.Alu64M.src);
+ addRegUsage_AMD64AMode(u, i->Ain.Alu64M.dst);
+ return;
+ case Ain_Sh64:
+ addHRegUse(u, HRmModify, i->Ain.Sh64.dst);
+ if (i->Ain.Sh64.src == 0)
+ addHRegUse(u, HRmRead, hregAMD64_RCX());
+ return;
+ case Ain_Test64:
+ addHRegUse(u, HRmRead, i->Ain.Test64.dst);
+ return;
+ case Ain_Unary64:
+ addHRegUse(u, HRmModify, i->Ain.Unary64.dst);
+ return;
+ case Ain_Lea64:
+ addRegUsage_AMD64AMode(u, i->Ain.Lea64.am);
+ addHRegUse(u, HRmWrite, i->Ain.Lea64.dst);
+ return;
+ case Ain_MulL:
+ addRegUsage_AMD64RM(u, i->Ain.MulL.src, HRmRead);
+ addHRegUse(u, HRmModify, hregAMD64_RAX());
+ addHRegUse(u, HRmWrite, hregAMD64_RDX());
+ return;
+ case Ain_Div:
+ addRegUsage_AMD64RM(u, i->Ain.Div.src, HRmRead);
+ addHRegUse(u, HRmModify, hregAMD64_RAX());
+ addHRegUse(u, HRmModify, hregAMD64_RDX());
+ return;
+//.. case Xin_Sh3232:
+//.. addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
+//.. addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
+//.. if (i->Xin.Sh3232.amt == 0)
+//.. addHRegUse(u, HRmRead, hregAMD64_ECX());
+//.. return;
+ case Ain_Push:
+ addRegUsage_AMD64RMI(u, i->Ain.Push.src);
+ addHRegUse(u, HRmModify, hregAMD64_RSP());
+ return;
+ case Ain_Call:
+ /* This is a bit subtle. */
+ /* First off, claim it trashes all the caller-saved regs
+ which fall within the register allocator's jurisdiction.
+ These I believe to be: rax rcx rdx rsi rdi r8 r9 r10 r11
+ and all the xmm registers.
+ */
+ addHRegUse(u, HRmWrite, hregAMD64_RAX());
+ addHRegUse(u, HRmWrite, hregAMD64_RCX());
+ addHRegUse(u, HRmWrite, hregAMD64_RDX());
+ addHRegUse(u, HRmWrite, hregAMD64_RSI());
+ addHRegUse(u, HRmWrite, hregAMD64_RDI());
+ addHRegUse(u, HRmWrite, hregAMD64_R8());
+ addHRegUse(u, HRmWrite, hregAMD64_R9());
+ addHRegUse(u, HRmWrite, hregAMD64_R10());
+ addHRegUse(u, HRmWrite, hregAMD64_R11());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM0());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM1());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM2());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM3());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM4());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM5());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM6());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM7());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM8());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM9());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM10());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM11());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM12());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM13());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM14());
+ addHRegUse(u, HRmWrite, hregAMD64_XMM15());
+
+ /* Now we have to state any parameter-carrying registers
+ which might be read. This depends on the regparmness. */
+ switch (i->Ain.Call.regparms) {
+ case 6: addHRegUse(u, HRmRead, hregAMD64_R9()); /*fallthru*/
+ case 5: addHRegUse(u, HRmRead, hregAMD64_R8()); /*fallthru*/
+ case 4: addHRegUse(u, HRmRead, hregAMD64_RCX()); /*fallthru*/
+ case 3: addHRegUse(u, HRmRead, hregAMD64_RDX()); /*fallthru*/
+ case 2: addHRegUse(u, HRmRead, hregAMD64_RSI()); /*fallthru*/
+ case 1: addHRegUse(u, HRmRead, hregAMD64_RDI()); break;
+ case 0: break;
+ default: vpanic("getRegUsage_AMD64Instr:Call:regparms");
+ }
+ /* Finally, there is the issue that the insn trashes a
+ register because the literal target address has to be
+ loaded into a register. Fortunately, r11 is stated in the
+ ABI as a scratch register, and so seems a suitable victim. */
+ addHRegUse(u, HRmWrite, hregAMD64_R11());
+ /* Upshot of this is that the assembler really must use r11,
+ and no other, as a destination temporary. */
+ return;
+ case Ain_Goto:
+ addRegUsage_AMD64RI(u, i->Ain.Goto.dst);
+ addHRegUse(u, HRmWrite, hregAMD64_RAX()); /* used for next guest addr */
+ addHRegUse(u, HRmWrite, hregAMD64_RDX()); /* used for dispatcher addr */
+ if (i->Ain.Goto.jk != Ijk_Boring
+ && i->Ain.Goto.jk != Ijk_Call
+ && i->Ain.Goto.jk != Ijk_Ret)
+ /* note, this is irrelevant since rbp is not actually
+ available to the allocator. But still .. */
+ addHRegUse(u, HRmWrite, hregAMD64_RBP());
+ return;
+ case Ain_CMov64:
+ addRegUsage_AMD64RM(u, i->Ain.CMov64.src, HRmRead);
+ addHRegUse(u, HRmModify, i->Ain.CMov64.dst);
+ return;
+ case Ain_MovxLQ:
+ addHRegUse(u, HRmRead, i->Ain.MovxLQ.src);
+ addHRegUse(u, HRmWrite, i->Ain.MovxLQ.dst);
+ return;
+ case Ain_LoadEX:
+ addRegUsage_AMD64AMode(u, i->Ain.LoadEX.src);
+ addHRegUse(u, HRmWrite, i->Ain.LoadEX.dst);
+ return;
+ case Ain_Store:
+ addHRegUse(u, HRmRead, i->Ain.Store.src);
+ addRegUsage_AMD64AMode(u, i->Ain.Store.dst);
+ return;
+ case Ain_Set64:
+ addHRegUse(u, HRmWrite, i->Ain.Set64.dst);
+ return;
+ case Ain_Bsfr64:
+ addHRegUse(u, HRmRead, i->Ain.Bsfr64.src);
+ addHRegUse(u, HRmWrite, i->Ain.Bsfr64.dst);
+ return;
+ case Ain_MFence:
+ return;
+ case Ain_ACAS:
+ addRegUsage_AMD64AMode(u, i->Ain.ACAS.addr);
+ addHRegUse(u, HRmRead, hregAMD64_RBX());
+ addHRegUse(u, HRmModify, hregAMD64_RAX());
+ return;
+ case Ain_DACAS:
+ addRegUsage_AMD64AMode(u, i->Ain.DACAS.addr);
+ addHRegUse(u, HRmRead, hregAMD64_RCX());
+ addHRegUse(u, HRmRead, hregAMD64_RBX());
+ addHRegUse(u, HRmModify, hregAMD64_RDX());
+ addHRegUse(u, HRmModify, hregAMD64_RAX());
+ return;
+ case Ain_A87Free:
+ return;
+ case Ain_A87PushPop:
+ addRegUsage_AMD64AMode(u, i->Ain.A87PushPop.addr);
+ return;
+ case Ain_A87FpOp:
+ return;
+ case Ain_A87LdCW:
+ addRegUsage_AMD64AMode(u, i->Ain.A87LdCW.addr);
+ return;
+ case Ain_A87StSW:
+ addRegUsage_AMD64AMode(u, i->Ain.A87StSW.addr);
+ return;
+//.. case Xin_FpUnary:
+//.. addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
+//.. addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
+//.. return;
+//.. case Xin_FpBinary:
+//.. addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
+//.. addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
+//.. addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
+//.. return;
+//.. case Xin_FpLdSt:
+//.. addRegUsage_AMD64AMode(u, i->Xin.FpLdSt.addr);
+//.. addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
+//.. i->Xin.FpLdSt.reg);
+//.. return;
+//.. case Xin_FpLdStI:
+//.. addRegUsage_AMD64AMode(u, i->Xin.FpLdStI.addr);
+//.. addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
+//.. i->Xin.FpLdStI.reg);
+//.. return;
+//.. case Xin_Fp64to32:
+//.. addHRegUse(u, HRmRead, i->Xin.Fp64to32.src);
+//.. addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
+//.. return;
+//.. case Xin_FpCMov:
+//.. addHRegUse(u, HRmRead, i->Xin.FpCMov.src);
+//.. addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
+//.. return;
+ case Ain_LdMXCSR:
+ addRegUsage_AMD64AMode(u, i->Ain.LdMXCSR.addr);
+ return;
+//.. case Xin_FpStSW_AX:
+//.. addHRegUse(u, HRmWrite, hregAMD64_EAX());
+//.. return;
+ case Ain_SseUComIS:
+ addHRegUse(u, HRmRead, i->Ain.SseUComIS.srcL);
+ addHRegUse(u, HRmRead, i->Ain.SseUComIS.srcR);
+ addHRegUse(u, HRmWrite, i->Ain.SseUComIS.dst);
+ return;
+ case Ain_SseSI2SF:
+ addHRegUse(u, HRmRead, i->Ain.SseSI2SF.src);
+ addHRegUse(u, HRmWrite, i->Ain.SseSI2SF.dst);
+ return;
+ case Ain_SseSF2SI:
+ addHRegUse(u, HRmRead, i->Ain.SseSF2SI.src);
+ addHRegUse(u, HRmWrite, i->Ain.SseSF2SI.dst);
+ return;
+ case Ain_SseSDSS:
+ addHRegUse(u, HRmRead, i->Ain.SseSDSS.src);
+ addHRegUse(u, HRmWrite, i->Ain.SseSDSS.dst);
+ return;
+ case Ain_SseLdSt:
+ addRegUsage_AMD64AMode(u, i->Ain.SseLdSt.addr);
+ addHRegUse(u, i->Ain.SseLdSt.isLoad ? HRmWrite : HRmRead,
+ i->Ain.SseLdSt.reg);
+ return;
+ case Ain_SseLdzLO:
+ addRegUsage_AMD64AMode(u, i->Ain.SseLdzLO.addr);
+ addHRegUse(u, HRmWrite, i->Ain.SseLdzLO.reg);
+ return;
+//.. case Xin_SseConst:
+//.. addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
+//.. return;
+ case Ain_Sse32Fx4:
+ vassert(i->Ain.Sse32Fx4.op != Asse_MOV);
+ unary = toBool( i->Ain.Sse32Fx4.op == Asse_RCPF
+ || i->Ain.Sse32Fx4.op == Asse_RSQRTF
+ || i->Ain.Sse32Fx4.op == Asse_SQRTF );
+ addHRegUse(u, HRmRead, i->Ain.Sse32Fx4.src);
+ addHRegUse(u, unary ? HRmWrite : HRmModify,
+ i->Ain.Sse32Fx4.dst);
+ return;
+ case Ain_Sse32FLo:
+ vassert(i->Ain.Sse32FLo.op != Asse_MOV);
+ unary = toBool( i->Ain.Sse32FLo.op == Asse_RCPF
+ || i->Ain.Sse32FLo.op == Asse_RSQRTF
+ || i->Ain.Sse32FLo.op == Asse_SQRTF );
+ addHRegUse(u, HRmRead, i->Ain.Sse32FLo.src);
+ addHRegUse(u, unary ? HRmWrite : HRmModify,
+ i->Ain.Sse32FLo.dst);
+ return;
+ case Ain_Sse64Fx2:
+ vassert(i->Ain.Sse64Fx2.op != Asse_MOV);
+ unary = toBool( i->Ain.Sse64Fx2.op == Asse_RCPF
+ || i->Ain.Sse64Fx2.op == Asse_RSQRTF
+ || i->Ain.Sse64Fx2.op == Asse_SQRTF );
+ addHRegUse(u, HRmRead, i->Ain.Sse64Fx2.src);
+ addHRegUse(u, unary ? HRmWrite : HRmModify,
+ i->Ain.Sse64Fx2.dst);
+ return;
+ case Ain_Sse64FLo:
+ vassert(i->Ain.Sse64FLo.op != Asse_MOV);
+ unary = toBool( i->Ain.Sse64FLo.op == Asse_RCPF
+ || i->Ain.Sse64FLo.op == Asse_RSQRTF
+ || i->Ain.Sse64FLo.op == Asse_SQRTF );
+ addHRegUse(u, HRmRead, i->Ain.Sse64FLo.src);
+ addHRegUse(u, unary ? HRmWrite : HRmModify,
+ i->Ain.Sse64FLo.dst);
+ return;
+ case Ain_SseReRg:
+ if ( (i->Ain.SseReRg.op == Asse_XOR
+ || i->Ain.SseReRg.op == Asse_CMPEQ32)
+ && i->Ain.SseReRg.src == i->Ain.SseReRg.dst) {
+            /* reg-alloc needs to understand 'xor r,r' and 'cmpeqd
+               r,r' as a write of a value to r, independent of any
+               previous value in r */
+ /* (as opposed to a rite of passage :-) */
+ addHRegUse(u, HRmWrite, i->Ain.SseReRg.dst);
+ } else {
+ addHRegUse(u, HRmRead, i->Ain.SseReRg.src);
+ addHRegUse(u, i->Ain.SseReRg.op == Asse_MOV
+ ? HRmWrite : HRmModify,
+ i->Ain.SseReRg.dst);
+ }
+ return;
+ case Ain_SseCMov:
+ addHRegUse(u, HRmRead, i->Ain.SseCMov.src);
+ addHRegUse(u, HRmModify, i->Ain.SseCMov.dst);
+ return;
+ case Ain_SseShuf:
+ addHRegUse(u, HRmRead, i->Ain.SseShuf.src);
+ addHRegUse(u, HRmWrite, i->Ain.SseShuf.dst);
+ return;
+ default:
+ ppAMD64Instr(i, mode64);
+ vpanic("getRegUsage_AMD64Instr");
+ }
+}
+
+/* local helper */
+static inline void mapReg(HRegRemap* m, HReg* r)
+{
+ *r = lookupHRegRemap(m, *r);
+}
+
+void mapRegs_AMD64Instr ( HRegRemap* m, AMD64Instr* i, Bool mode64 )
+{
+ vassert(mode64 == True);
+ switch (i->tag) {
+ case Ain_Imm64:
+ mapReg(m, &i->Ain.Imm64.dst);
+ return;
+ case Ain_Alu64R:
+ mapRegs_AMD64RMI(m, i->Ain.Alu64R.src);
+ mapReg(m, &i->Ain.Alu64R.dst);
+ return;
+ case Ain_Alu64M:
+ mapRegs_AMD64RI(m, i->Ain.Alu64M.src);
+ mapRegs_AMD64AMode(m, i->Ain.Alu64M.dst);
+ return;
+ case Ain_Sh64:
+ mapReg(m, &i->Ain.Sh64.dst);
+ return;
+ case Ain_Test64:
+ mapReg(m, &i->Ain.Test64.dst);
+ return;
+ case Ain_Unary64:
+ mapReg(m, &i->Ain.Unary64.dst);
+ return;
+ case Ain_Lea64:
+ mapRegs_AMD64AMode(m, i->Ain.Lea64.am);
+ mapReg(m, &i->Ain.Lea64.dst);
+ return;
+ case Ain_MulL:
+ mapRegs_AMD64RM(m, i->Ain.MulL.src);
+ return;
+ case Ain_Div:
+ mapRegs_AMD64RM(m, i->Ain.Div.src);
+ return;
+//.. case Xin_Sh3232:
+//.. mapReg(m, &i->Xin.Sh3232.src);
+//.. mapReg(m, &i->Xin.Sh3232.dst);
+//.. return;
+ case Ain_Push:
+ mapRegs_AMD64RMI(m, i->Ain.Push.src);
+ return;
+ case Ain_Call:
+ return;
+ case Ain_Goto:
+ mapRegs_AMD64RI(m, i->Ain.Goto.dst);
+ return;
+ case Ain_CMov64:
+ mapRegs_AMD64RM(m, i->Ain.CMov64.src);
+ mapReg(m, &i->Ain.CMov64.dst);
+ return;
+ case Ain_MovxLQ:
+ mapReg(m, &i->Ain.MovxLQ.src);
+ mapReg(m, &i->Ain.MovxLQ.dst);
+ return;
+ case Ain_LoadEX:
+ mapRegs_AMD64AMode(m, i->Ain.LoadEX.src);
+ mapReg(m, &i->Ain.LoadEX.dst);
+ return;
+ case Ain_Store:
+ mapReg(m, &i->Ain.Store.src);
+ mapRegs_AMD64AMode(m, i->Ain.Store.dst);
+ return;
+ case Ain_Set64:
+ mapReg(m, &i->Ain.Set64.dst);
+ return;
+ case Ain_Bsfr64:
+ mapReg(m, &i->Ain.Bsfr64.src);
+ mapReg(m, &i->Ain.Bsfr64.dst);
+ return;
+ case Ain_MFence:
+ return;
+ case Ain_ACAS:
+ mapRegs_AMD64AMode(m, i->Ain.ACAS.addr);
+ return;
+ case Ain_DACAS:
+ mapRegs_AMD64AMode(m, i->Ain.DACAS.addr);
+ return;
+ case Ain_A87Free:
+ return;
+ case Ain_A87PushPop:
+ mapRegs_AMD64AMode(m, i->Ain.A87PushPop.addr);
+ return;
+ case Ain_A87FpOp:
+ return;
+ case Ain_A87LdCW:
+ mapRegs_AMD64AMode(m, i->Ain.A87LdCW.addr);
+ return;
+ case Ain_A87StSW:
+ mapRegs_AMD64AMode(m, i->Ain.A87StSW.addr);
+ return;
+//.. case Xin_FpUnary:
+//.. mapReg(m, &i->Xin.FpUnary.src);
+//.. mapReg(m, &i->Xin.FpUnary.dst);
+//.. return;
+//.. case Xin_FpBinary:
+//.. mapReg(m, &i->Xin.FpBinary.srcL);
+//.. mapReg(m, &i->Xin.FpBinary.srcR);
+//.. mapReg(m, &i->Xin.FpBinary.dst);
+//.. return;
+//.. case Xin_FpLdSt:
+//.. mapRegs_AMD64AMode(m, i->Xin.FpLdSt.addr);
+//.. mapReg(m, &i->Xin.FpLdSt.reg);
+//.. return;
+//.. case Xin_FpLdStI:
+//.. mapRegs_AMD64AMode(m, i->Xin.FpLdStI.addr);
+//.. mapReg(m, &i->Xin.FpLdStI.reg);
+//.. return;
+//.. case Xin_Fp64to32:
+//.. mapReg(m, &i->Xin.Fp64to32.src);
+//.. mapReg(m, &i->Xin.Fp64to32.dst);
+//.. return;
+//.. case Xin_FpCMov:
+//.. mapReg(m, &i->Xin.FpCMov.src);
+//.. mapReg(m, &i->Xin.FpCMov.dst);
+//.. return;
+ case Ain_LdMXCSR:
+ mapRegs_AMD64AMode(m, i->Ain.LdMXCSR.addr);
+ return;
+//.. case Xin_FpStSW_AX:
+//.. return;
+ case Ain_SseUComIS:
+ mapReg(m, &i->Ain.SseUComIS.srcL);
+ mapReg(m, &i->Ain.SseUComIS.srcR);
+ mapReg(m, &i->Ain.SseUComIS.dst);
+ return;
+ case Ain_SseSI2SF:
+ mapReg(m, &i->Ain.SseSI2SF.src);
+ mapReg(m, &i->Ain.SseSI2SF.dst);
+ return;
+ case Ain_SseSF2SI:
+ mapReg(m, &i->Ain.SseSF2SI.src);
+ mapReg(m, &i->Ain.SseSF2SI.dst);
+ return;
+ case Ain_SseSDSS:
+ mapReg(m, &i->Ain.SseSDSS.src);
+ mapReg(m, &i->Ain.SseSDSS.dst);
+ return;
+//.. case Xin_SseConst:
+//.. mapReg(m, &i->Xin.SseConst.dst);
+//.. return;
+ case Ain_SseLdSt:
+ mapReg(m, &i->Ain.SseLdSt.reg);
+ mapRegs_AMD64AMode(m, i->Ain.SseLdSt.addr);
+         return;
+ case Ain_SseLdzLO:
+ mapReg(m, &i->Ain.SseLdzLO.reg);
+ mapRegs_AMD64AMode(m, i->Ain.SseLdzLO.addr);
+         return;
+ case Ain_Sse32Fx4:
+ mapReg(m, &i->Ain.Sse32Fx4.src);
+ mapReg(m, &i->Ain.Sse32Fx4.dst);
+ return;
+ case Ain_Sse32FLo:
+ mapReg(m, &i->Ain.Sse32FLo.src);
+ mapReg(m, &i->Ain.Sse32FLo.dst);
+ return;
+ case Ain_Sse64Fx2:
+ mapReg(m, &i->Ain.Sse64Fx2.src);
+ mapReg(m, &i->Ain.Sse64Fx2.dst);
+ return;
+ case Ain_Sse64FLo:
+ mapReg(m, &i->Ain.Sse64FLo.src);
+ mapReg(m, &i->Ain.Sse64FLo.dst);
+ return;
+ case Ain_SseReRg:
+ mapReg(m, &i->Ain.SseReRg.src);
+ mapReg(m, &i->Ain.SseReRg.dst);
+ return;
+ case Ain_SseCMov:
+ mapReg(m, &i->Ain.SseCMov.src);
+ mapReg(m, &i->Ain.SseCMov.dst);
+ return;
+ case Ain_SseShuf:
+ mapReg(m, &i->Ain.SseShuf.src);
+ mapReg(m, &i->Ain.SseShuf.dst);
+ return;
+ default:
+ ppAMD64Instr(i, mode64);
+ vpanic("mapRegs_AMD64Instr");
+ }
+}
+
+/* Figure out if i represents a reg-reg move, and if so assign the
+ source and destination to *src and *dst. If in doubt say No. Used
+ by the register allocator to do move coalescing.
+*/
+Bool isMove_AMD64Instr ( AMD64Instr* i, HReg* src, HReg* dst )
+{
+ /* Moves between integer regs */
+ if (i->tag == Ain_Alu64R) {
+ if (i->Ain.Alu64R.op != Aalu_MOV)
+ return False;
+ if (i->Ain.Alu64R.src->tag != Armi_Reg)
+ return False;
+ *src = i->Ain.Alu64R.src->Armi.Reg.reg;
+ *dst = i->Ain.Alu64R.dst;
+ return True;
+ }
+ /* Moves between vector regs */
+ if (i->tag == Ain_SseReRg) {
+ if (i->Ain.SseReRg.op != Asse_MOV)
+ return False;
+ *src = i->Ain.SseReRg.src;
+ *dst = i->Ain.SseReRg.dst;
+ return True;
+ }
+ return False;
+}
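+
+/* For example (a sketch of how the allocator uses this): "movq
+   %rax, %rbx" -- an Ain_Alu64R with op Aalu_MOV and an Armi_Reg
+   source -- reports src = %rax, dst = %rbx, which lets the two
+   registers be coalesced when their live ranges permit. */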
+
+
+/* Generate amd64 spill/reload instructions under the direction of the
+ register allocator. Note it's critical these don't write the
+ condition codes. */
+
+void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 )
+{
+ AMD64AMode* am;
+ vassert(offsetB >= 0);
+ vassert(!hregIsVirtual(rreg));
+ vassert(mode64 == True);
+ *i1 = *i2 = NULL;
+ am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
+ switch (hregClass(rreg)) {
+ case HRcInt64:
+ *i1 = AMD64Instr_Alu64M ( Aalu_MOV, AMD64RI_Reg(rreg), am );
+ return;
+ case HRcVec128:
+ *i1 = AMD64Instr_SseLdSt ( False/*store*/, 16, rreg, am );
+ return;
+ default:
+ ppHRegClass(hregClass(rreg));
+ vpanic("genSpill_AMD64: unimplemented regclass");
+ }
+}
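+
+/* For instance, spilling an HRcInt64 register at offsetB == 24
+   produces "movq %rreg, 24(%rbp)", and the matching reload below
+   produces "movq 24(%rbp), %rreg".  Both are plain MOVs, so the
+   condition codes are left alone, as required. */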
+
+void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 )
+{
+ AMD64AMode* am;
+ vassert(offsetB >= 0);
+ vassert(!hregIsVirtual(rreg));
+ vassert(mode64 == True);
+ *i1 = *i2 = NULL;
+ am = AMD64AMode_IR(offsetB, hregAMD64_RBP());
+ switch (hregClass(rreg)) {
+ case HRcInt64:
+ *i1 = AMD64Instr_Alu64R ( Aalu_MOV, AMD64RMI_Mem(am), rreg );
+ return;
+ case HRcVec128:
+ *i1 = AMD64Instr_SseLdSt ( True/*load*/, 16, rreg, am );
+ return;
+ default:
+ ppHRegClass(hregClass(rreg));
+ vpanic("genReload_AMD64: unimplemented regclass");
+ }
+}
+
+
+/* --------- The amd64 assembler (bleh.) --------- */
+
+/* Produce the low three bits of an integer register number. */
+static UChar iregBits210 ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcInt64);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 15);
+ return toUChar(n & 7);
+}
+
+/* Produce bit 3 of an integer register number. */
+static UChar iregBit3 ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcInt64);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 15);
+ return toUChar((n >> 3) & 1);
+}
+
+/* Produce a complete 4-bit integer register number. */
+static UChar iregBits3210 ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcInt64);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 15);
+ return toUChar(n);
+}
+
+/* Given an xmm (128-bit V-class) register, produce the equivalently
+   numbered register in the 64-bit I-class.  This is a bit of fakery
+   which allows functions that work on integer register numbers to be
+   reused when assembling SSE instructions too. */
+static HReg vreg2ireg ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcVec128);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 15);
+ return mkHReg(n, HRcInt64, False);
+}
+
+static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
+{
+ return toUChar( ((mod & 3) << 6)
+ | ((reg & 7) << 3)
+ | (regmem & 7) );
+}
+
+static UChar mkSIB ( Int shift, Int regindex, Int regbase )
+{
+ return toUChar( ((shift & 3) << 6)
+ | ((regindex & 7) << 3)
+ | (regbase & 7) );
+}
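+
+/* A couple of worked examples: mkModRegRM(1,0,3) == 0x43 (mod=01,
+   reg=000, rm=011), and mkSIB(2,1,3) == 0x8B -- the SIB byte for an
+   (%rbx,%rcx,4) style operand (scale 4 is shift field 2, index
+   %rcx = 1, base %rbx = 3), ignoring REX extension bits, which are
+   dealt with separately. */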
+
+static UChar* emit32 ( UChar* p, UInt w32 )
+{
+ *p++ = toUChar((w32) & 0x000000FF);
+ *p++ = toUChar((w32 >> 8) & 0x000000FF);
+ *p++ = toUChar((w32 >> 16) & 0x000000FF);
+ *p++ = toUChar((w32 >> 24) & 0x000000FF);
+ return p;
+}
+
+static UChar* emit64 ( UChar* p, ULong w64 )
+{
+ p = emit32(p, toUInt(w64 & 0xFFFFFFFF));
+ p = emit32(p, toUInt((w64 >> 32) & 0xFFFFFFFF));
+ return p;
+}
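+
+/* e.g. emit32(p, 0x12345678) lays down 78 56 34 12 -- everything
+   here is little-endian, as the host requires. */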
+
+/* Does a sign-extend of the lowest 8 bits give
+ the original number? */
+static Bool fits8bits ( UInt w32 )
+{
+ Int i32 = (Int)w32;
+ return toBool(i32 == ((i32 << 24) >> 24));
+}
+
+/* Can the lower 32 bits be signedly widened to produce the whole
+ 64-bit value? In other words, are the top 33 bits either all 0 or
+ all 1 ? */
+static Bool fitsIn32Bits ( ULong x )
+{
+ Long y0 = (Long)x;
+ Long y1 = y0;
+ y1 <<= 32;
+ y1 >>=/*s*/ 32;
+ return toBool(x == y1);
+}
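+
+/* Worked examples for the two predicates above:
+      fits8bits(0x7F)       == True    (sext of 0x7F)
+      fits8bits(0x80)       == False   (sext gives 0xFFFFFF80)
+      fits8bits(0xFFFFFF80) == True    (is already sext(-128))
+      fitsIn32Bits(0x7FFFFFFFULL)         == True
+      fitsIn32Bits(0xFFFFFFFF80000000ULL) == True
+      fitsIn32Bits(0x80000000ULL)         == False (top 33 bits differ)
+*/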
+
+
+/* Forming mod-reg-rm bytes and scale-index-base bytes.
+
+ greg, 0(ereg) | ereg is not any of: RSP RBP R12 R13
+ = 00 greg ereg
+
+ greg, d8(ereg) | ereg is neither of: RSP R12
+ = 01 greg ereg, d8
+
+ greg, d32(ereg) | ereg is neither of: RSP R12
+ = 10 greg ereg, d32
+
+ greg, d8(ereg) | ereg is either: RSP R12
+ = 01 greg 100, 0x24, d8
+ (lowest bit of rex distinguishes R12/RSP)
+
+ greg, d32(ereg) | ereg is either: RSP R12
+ = 10 greg 100, 0x24, d32
+ (lowest bit of rex distinguishes R12/RSP)
+
+ -----------------------------------------------
+
+ greg, d8(base,index,scale)
+ | index != RSP
+ = 01 greg 100, scale index base, d8
+
+ greg, d32(base,index,scale)
+ | index != RSP
+ = 10 greg 100, scale index base, d32
+*/
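+/* Worked example of the above (REX is emitted separately): with
+   greg = %rcx (reg field 1) and am = 16(%rsi) -- an Aam_IR whose
+   displacement fits in 8 bits and whose ereg is not RSP/R12 --
+   doAMode_M produces the mod-reg-rm byte 0x4E (01 001 110) followed
+   by the disp8 byte 0x10. */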
+static UChar* doAMode_M ( UChar* p, HReg greg, AMD64AMode* am )
+{
+ if (am->tag == Aam_IR) {
+ if (am->Aam.IR.imm == 0
+ && am->Aam.IR.reg != hregAMD64_RSP()
+ && am->Aam.IR.reg != hregAMD64_RBP()
+ && am->Aam.IR.reg != hregAMD64_R12()
+ && am->Aam.IR.reg != hregAMD64_R13()
+ ) {
+ *p++ = mkModRegRM(0, iregBits210(greg),
+ iregBits210(am->Aam.IR.reg));
+ return p;
+ }
+ if (fits8bits(am->Aam.IR.imm)
+ && am->Aam.IR.reg != hregAMD64_RSP()
+ && am->Aam.IR.reg != hregAMD64_R12()
+ ) {
+ *p++ = mkModRegRM(1, iregBits210(greg),
+ iregBits210(am->Aam.IR.reg));
+ *p++ = toUChar(am->Aam.IR.imm & 0xFF);
+ return p;
+ }
+ if (am->Aam.IR.reg != hregAMD64_RSP()
+ && am->Aam.IR.reg != hregAMD64_R12()
+ ) {
+ *p++ = mkModRegRM(2, iregBits210(greg),
+ iregBits210(am->Aam.IR.reg));
+ p = emit32(p, am->Aam.IR.imm);
+ return p;
+ }
+ if ((am->Aam.IR.reg == hregAMD64_RSP()
+ || am->Aam.IR.reg == hregAMD64_R12())
+ && fits8bits(am->Aam.IR.imm)) {
+ *p++ = mkModRegRM(1, iregBits210(greg), 4);
+ *p++ = 0x24;
+ *p++ = toUChar(am->Aam.IR.imm & 0xFF);
+ return p;
+ }
+      if (/* am->Aam.IR.reg == hregAMD64_RSP()
+            || -- RSP case disabled, awaiting a test case */
+          am->Aam.IR.reg == hregAMD64_R12()) {
+ *p++ = mkModRegRM(2, iregBits210(greg), 4);
+ *p++ = 0x24;
+ p = emit32(p, am->Aam.IR.imm);
+ return p;
+ }
+ ppAMD64AMode(am);
+ vpanic("doAMode_M: can't emit amode IR");
+ /*NOTREACHED*/
+ }
+ if (am->tag == Aam_IRRS) {
+ if (fits8bits(am->Aam.IRRS.imm)
+ && am->Aam.IRRS.index != hregAMD64_RSP()) {
+ *p++ = mkModRegRM(1, iregBits210(greg), 4);
+ *p++ = mkSIB(am->Aam.IRRS.shift, am->Aam.IRRS.index,
+ am->Aam.IRRS.base);
+ *p++ = toUChar(am->Aam.IRRS.imm & 0xFF);
+ return p;
+ }
+ if (am->Aam.IRRS.index != hregAMD64_RSP()) {
+ *p++ = mkModRegRM(2, iregBits210(greg), 4);
+ *p++ = mkSIB(am->Aam.IRRS.shift, am->Aam.IRRS.index,
+ am->Aam.IRRS.base);
+ p = emit32(p, am->Aam.IRRS.imm);
+ return p;
+ }
+ ppAMD64AMode(am);
+ vpanic("doAMode_M: can't emit amode IRRS");
+ /*NOTREACHED*/
+ }
+ vpanic("doAMode_M: unknown amode");
+ /*NOTREACHED*/
+}
+
+
+/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
+static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
+{
+ *p++ = mkModRegRM(3, iregBits210(greg), iregBits210(ereg));
+ return p;
+}
+
+
+/* Clear the W bit on a REX byte, thereby changing the operand size
+ back to whatever that instruction's default operand size is. */
+static inline UChar clearWBit ( UChar rex )
+{
+ return toUChar(rex & ~(1<<3));
+}
+
+
+/* Make up a REX byte, with W=1 (size=64), for a (greg,amode) pair. */
+static UChar rexAMode_M ( HReg greg, AMD64AMode* am )
+{
+ if (am->tag == Aam_IR) {
+ UChar W = 1; /* we want 64-bit mode */
+ UChar R = iregBit3(greg);
+ UChar X = 0; /* not relevant */
+ UChar B = iregBit3(am->Aam.IR.reg);
+ return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
+ }
+ if (am->tag == Aam_IRRS) {
+ UChar W = 1; /* we want 64-bit mode */
+ UChar R = iregBit3(greg);
+ UChar X = iregBit3(am->Aam.IRRS.index);
+ UChar B = iregBit3(am->Aam.IRRS.base);
+ return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
+ }
+ vassert(0);
+ return 0; /*NOTREACHED*/
+}
+
+/* Make up a REX byte, with W=1 (size=64), for a (greg,ereg) pair. */
+static UChar rexAMode_R ( HReg greg, HReg ereg )
+{
+ UChar W = 1; /* we want 64-bit mode */
+ UChar R = iregBit3(greg);
+ UChar X = 0; /* not relevant */
+ UChar B = iregBit3(ereg);
+ return toUChar(0x40 + ((W << 3) | (R << 2) | (X << 1) | (B << 0)));
+}
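+
+/* For instance, rexAMode_R(%r9, %rax) gives W=1, R=1 (%r9 needs the
+   high bit), X=0, B=0, hence 0x40 + 0xC = 0x4C, the familiar REX.WR
+   prefix. */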
+
+
+/* Emit ffree %st(N) */
+static UChar* do_ffree_st ( UChar* p, Int n )
+{
+ vassert(n >= 0 && n <= 7);
+ *p++ = 0xDD;
+ *p++ = toUChar(0xC0 + n);
+ return p;
+}
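+
+/* e.g. do_ffree_st(p, 7) emits DD C7, the "ffree %st(7)" idiom that
+   Ain_A87Free below relies on. */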
+
+//.. /* Emit fstp %st(i), 1 <= i <= 7 */
+//.. static UChar* do_fstp_st ( UChar* p, Int i )
+//.. {
+//.. vassert(1 <= i && i <= 7);
+//.. *p++ = 0xDD;
+//.. *p++ = 0xD8+i;
+//.. return p;
+//.. }
+//..
+//.. /* Emit fld %st(i), 0 <= i <= 6 */
+//.. static UChar* do_fld_st ( UChar* p, Int i )
+//.. {
+//.. vassert(0 <= i && i <= 6);
+//.. *p++ = 0xD9;
+//.. *p++ = 0xC0+i;
+//.. return p;
+//.. }
+//..
+//.. /* Emit f<op> %st(0) */
+//.. static UChar* do_fop1_st ( UChar* p, AMD64FpOp op )
+//.. {
+//.. switch (op) {
+//.. case Xfp_NEG: *p++ = 0xD9; *p++ = 0xE0; break;
+//.. case Xfp_ABS: *p++ = 0xD9; *p++ = 0xE1; break;
+//.. case Xfp_SQRT: *p++ = 0xD9; *p++ = 0xFA; break;
+//.. case Xfp_ROUND: *p++ = 0xD9; *p++ = 0xFC; break;
+//.. case Xfp_SIN: *p++ = 0xD9; *p++ = 0xFE; break;
+//.. case Xfp_COS: *p++ = 0xD9; *p++ = 0xFF; break;
+//.. case Xfp_2XM1: *p++ = 0xD9; *p++ = 0xF0; break;
+//.. case Xfp_MOV: break;
+//.. case Xfp_TAN: p = do_ffree_st7(p); /* since fptan pushes 1.0 */
+//.. *p++ = 0xD9; *p++ = 0xF2; /* fptan */
+//.. *p++ = 0xD9; *p++ = 0xF7; /* fincstp */
+//.. break;
+//.. default: vpanic("do_fop1_st: unknown op");
+//.. }
+//.. return p;
+//.. }
+//..
+//.. /* Emit f<op> %st(i), 1 <= i <= 5 */
+//.. static UChar* do_fop2_st ( UChar* p, AMD64FpOp op, Int i )
+//.. {
+//.. # define fake(_n) mkHReg((_n), HRcInt32, False)
+//.. Int subopc;
+//.. switch (op) {
+//.. case Xfp_ADD: subopc = 0; break;
+//.. case Xfp_SUB: subopc = 4; break;
+//.. case Xfp_MUL: subopc = 1; break;
+//.. case Xfp_DIV: subopc = 6; break;
+//.. default: vpanic("do_fop2_st: unknown op");
+//.. }
+//.. *p++ = 0xD8;
+//.. p = doAMode_R(p, fake(subopc), fake(i));
+//.. return p;
+//.. # undef fake
+//.. }
+//..
+//.. /* Push a 32-bit word on the stack. The word depends on tags[3:0];
+//.. each byte is either 0x00 or 0xFF depending on the corresponding bit in tags[].
+//.. */
+//.. static UChar* push_word_from_tags ( UChar* p, UShort tags )
+//.. {
+//.. UInt w;
+//.. vassert(0 == (tags & ~0xF));
+//.. if (tags == 0) {
+//.. /* pushl $0x00000000 */
+//.. *p++ = 0x6A;
+//.. *p++ = 0x00;
+//.. }
+//.. else
+//.. /* pushl $0xFFFFFFFF */
+//.. if (tags == 0xF) {
+//.. *p++ = 0x6A;
+//.. *p++ = 0xFF;
+//.. } else {
+//.. vassert(0); /* awaiting test case */
+//.. w = 0;
+//.. if (tags & 1) w |= 0x000000FF;
+//.. if (tags & 2) w |= 0x0000FF00;
+//.. if (tags & 4) w |= 0x00FF0000;
+//.. if (tags & 8) w |= 0xFF000000;
+//.. *p++ = 0x68;
+//.. p = emit32(p, w);
+//.. }
+//.. return p;
+//.. }
+
+/* Emit an instruction into buf and return the number of bytes used.
+ Note that buf is not the insn's final place, and therefore it is
+ imperative to emit position-independent code. */
+
+Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr* i,
+ Bool mode64, void* dispatch )
+{
+ UInt /*irno,*/ opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
+ UInt xtra;
+ UInt reg;
+ UChar rex;
+ UChar* p = &buf[0];
+ UChar* ptmp;
+ Int j;
+ vassert(nbuf >= 32);
+ vassert(mode64 == True);
+
+   /* Wrap an integer as an int register, for use when assembling
+ GrpN insns, in which the greg field is used as a sub-opcode
+ and does not really contain a register. */
+# define fake(_n) mkHReg((_n), HRcInt64, False)
+
+ /* vex_printf("asm "); ppAMD64Instr(i, mode64); vex_printf("\n"); */
+
+ switch (i->tag) {
+
+ case Ain_Imm64:
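+         /* movabsq $imm64, %dst.  For example, with dst = %r10
+            (encoding 10) this comes out as 49 BA <imm64>: a REX.WB
+            prefix, then opcode 0xB8+2. */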
+ *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Imm64.dst)));
+ *p++ = toUChar(0xB8 + iregBits210(i->Ain.Imm64.dst));
+ p = emit64(p, i->Ain.Imm64.imm64);
+ goto done;
+
+ case Ain_Alu64R:
+ /* Deal specially with MOV */
+ if (i->Ain.Alu64R.op == Aalu_MOV) {
+ switch (i->Ain.Alu64R.src->tag) {
+ case Armi_Imm:
+ if (0 == (i->Ain.Alu64R.src->Armi.Imm.imm32 & ~0xFFF)) {
+ /* Actually we could use this form for constants in
+ the range 0 through 0x7FFFFFFF inclusive, but
+ limit it to a small range for verifiability
+ purposes. */
+ /* Generate "movl $imm32, 32-bit-register" and let
+ the default zero-extend rule cause the upper half
+ of the dst to be zeroed out too. This saves 1
+ and sometimes 2 bytes compared to the more
+ obvious encoding in the 'else' branch. */
+ if (1 & iregBit3(i->Ain.Alu64R.dst))
+ *p++ = 0x41;
+ *p++ = 0xB8 + iregBits210(i->Ain.Alu64R.dst);
+ p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
+ } else {
+ *p++ = toUChar(0x48 + (1 & iregBit3(i->Ain.Alu64R.dst)));
+ *p++ = 0xC7;
+ *p++ = toUChar(0xC0 + iregBits210(i->Ain.Alu64R.dst));
+ p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
+ }
+ goto done;
+ case Armi_Reg:
+ *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
+ i->Ain.Alu64R.dst );
+ *p++ = 0x89;
+ p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
+ i->Ain.Alu64R.dst);
+ goto done;
+ case Armi_Mem:
+ *p++ = rexAMode_M(i->Ain.Alu64R.dst,
+ i->Ain.Alu64R.src->Armi.Mem.am);
+ *p++ = 0x8B;
+ p = doAMode_M(p, i->Ain.Alu64R.dst,
+ i->Ain.Alu64R.src->Armi.Mem.am);
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+ /* MUL */
+ if (i->Ain.Alu64R.op == Aalu_MUL) {
+ switch (i->Ain.Alu64R.src->tag) {
+ case Armi_Reg:
+ *p++ = rexAMode_R( i->Ain.Alu64R.dst,
+ i->Ain.Alu64R.src->Armi.Reg.reg);
+ *p++ = 0x0F;
+ *p++ = 0xAF;
+ p = doAMode_R(p, i->Ain.Alu64R.dst,
+ i->Ain.Alu64R.src->Armi.Reg.reg);
+ goto done;
+ case Armi_Mem:
+ *p++ = rexAMode_M(i->Ain.Alu64R.dst,
+ i->Ain.Alu64R.src->Armi.Mem.am);
+ *p++ = 0x0F;
+ *p++ = 0xAF;
+ p = doAMode_M(p, i->Ain.Alu64R.dst,
+ i->Ain.Alu64R.src->Armi.Mem.am);
+ goto done;
+ case Armi_Imm:
+ if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
+ *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
+ *p++ = 0x6B;
+ p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
+ *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
+ } else {
+ *p++ = rexAMode_R(i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
+ *p++ = 0x69;
+ p = doAMode_R(p, i->Ain.Alu64R.dst, i->Ain.Alu64R.dst);
+ p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
+ }
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+ /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
+ opc = opc_rr = subopc_imm = opc_imma = 0;
+ switch (i->Ain.Alu64R.op) {
+ case Aalu_ADC: opc = 0x13; opc_rr = 0x11;
+ subopc_imm = 2; opc_imma = 0x15; break;
+ case Aalu_ADD: opc = 0x03; opc_rr = 0x01;
+ subopc_imm = 0; opc_imma = 0x05; break;
+ case Aalu_SUB: opc = 0x2B; opc_rr = 0x29;
+ subopc_imm = 5; opc_imma = 0x2D; break;
+ case Aalu_SBB: opc = 0x1B; opc_rr = 0x19;
+ subopc_imm = 3; opc_imma = 0x1D; break;
+ case Aalu_AND: opc = 0x23; opc_rr = 0x21;
+ subopc_imm = 4; opc_imma = 0x25; break;
+ case Aalu_XOR: opc = 0x33; opc_rr = 0x31;
+ subopc_imm = 6; opc_imma = 0x35; break;
+ case Aalu_OR: opc = 0x0B; opc_rr = 0x09;
+ subopc_imm = 1; opc_imma = 0x0D; break;
+ case Aalu_CMP: opc = 0x3B; opc_rr = 0x39;
+ subopc_imm = 7; opc_imma = 0x3D; break;
+ default: goto bad;
+ }
+ switch (i->Ain.Alu64R.src->tag) {
+ case Armi_Imm:
+ if (i->Ain.Alu64R.dst == hregAMD64_RAX()
+ && !fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
+ goto bad; /* FIXME: awaiting test case */
+ *p++ = toUChar(opc_imma);
+ p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
+ } else
+ if (fits8bits(i->Ain.Alu64R.src->Armi.Imm.imm32)) {
+ *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst );
+ *p++ = 0x83;
+ p = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
+ *p++ = toUChar(0xFF & i->Ain.Alu64R.src->Armi.Imm.imm32);
+ } else {
+ *p++ = rexAMode_R( fake(0), i->Ain.Alu64R.dst);
+ *p++ = 0x81;
+ p = doAMode_R(p, fake(subopc_imm), i->Ain.Alu64R.dst);
+ p = emit32(p, i->Ain.Alu64R.src->Armi.Imm.imm32);
+ }
+ goto done;
+ case Armi_Reg:
+ *p++ = rexAMode_R( i->Ain.Alu64R.src->Armi.Reg.reg,
+ i->Ain.Alu64R.dst);
+ *p++ = toUChar(opc_rr);
+ p = doAMode_R(p, i->Ain.Alu64R.src->Armi.Reg.reg,
+ i->Ain.Alu64R.dst);
+ goto done;
+ case Armi_Mem:
+ *p++ = rexAMode_M( i->Ain.Alu64R.dst,
+ i->Ain.Alu64R.src->Armi.Mem.am);
+ *p++ = toUChar(opc);
+ p = doAMode_M(p, i->Ain.Alu64R.dst,
+ i->Ain.Alu64R.src->Armi.Mem.am);
+ goto done;
+ default:
+ goto bad;
+ }
+ break;
+
+ case Ain_Alu64M:
+ /* Deal specially with MOV */
+ if (i->Ain.Alu64M.op == Aalu_MOV) {
+ switch (i->Ain.Alu64M.src->tag) {
+ case Ari_Reg:
+ *p++ = rexAMode_M(i->Ain.Alu64M.src->Ari.Reg.reg,
+ i->Ain.Alu64M.dst);
+ *p++ = 0x89;
+ p = doAMode_M(p, i->Ain.Alu64M.src->Ari.Reg.reg,
+ i->Ain.Alu64M.dst);
+ goto done;
+ case Ari_Imm:
+ *p++ = rexAMode_M(fake(0), i->Ain.Alu64M.dst);
+ *p++ = 0xC7;
+ p = doAMode_M(p, fake(0), i->Ain.Alu64M.dst);
+ p = emit32(p, i->Ain.Alu64M.src->Ari.Imm.imm32);
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+//.. /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP. MUL is not
+//.. allowed here. */
+//.. opc = subopc_imm = opc_imma = 0;
+//.. switch (i->Xin.Alu32M.op) {
+//.. case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
+//.. case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
+//.. default: goto bad;
+//.. }
+//.. switch (i->Xin.Alu32M.src->tag) {
+//.. case Xri_Reg:
+//.. *p++ = opc;
+//.. p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
+//.. i->Xin.Alu32M.dst);
+//.. goto done;
+//.. case Xri_Imm:
+//.. if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
+//.. *p++ = 0x83;
+//.. p = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
+//.. *p++ = 0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32;
+//.. goto done;
+//.. } else {
+//.. *p++ = 0x81;
+//.. p = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
+//.. p = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
+//.. goto done;
+//.. }
+//.. default:
+//.. goto bad;
+//.. }
+ break;
+
+ case Ain_Sh64:
+ opc_cl = opc_imm = subopc = 0;
+ switch (i->Ain.Sh64.op) {
+ case Ash_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
+ case Ash_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
+ case Ash_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
+ default: goto bad;
+ }
+ if (i->Ain.Sh64.src == 0) {
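+            /* src == 0 encodes "shift by %cl" (hence the opc_cl form). */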
+ *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
+ *p++ = toUChar(opc_cl);
+ p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
+ goto done;
+ } else {
+ *p++ = rexAMode_R(fake(0), i->Ain.Sh64.dst);
+ *p++ = toUChar(opc_imm);
+ p = doAMode_R(p, fake(subopc), i->Ain.Sh64.dst);
+            *p++ = toUChar(i->Ain.Sh64.src);
+ goto done;
+ }
+ break;
+
+ case Ain_Test64:
+ /* testq sign-extend($imm32), %reg */
+ *p++ = rexAMode_R(fake(0), i->Ain.Test64.dst);
+ *p++ = 0xF7;
+ p = doAMode_R(p, fake(0), i->Ain.Test64.dst);
+ p = emit32(p, i->Ain.Test64.imm32);
+ goto done;
+
+ case Ain_Unary64:
+ if (i->Ain.Unary64.op == Aun_NOT) {
+ *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
+ *p++ = 0xF7;
+ p = doAMode_R(p, fake(2), i->Ain.Unary64.dst);
+ goto done;
+ }
+ if (i->Ain.Unary64.op == Aun_NEG) {
+ *p++ = rexAMode_R(fake(0), i->Ain.Unary64.dst);
+ *p++ = 0xF7;
+ p = doAMode_R(p, fake(3), i->Ain.Unary64.dst);
+ goto done;
+ }
+ break;
+
+ case Ain_Lea64:
+ *p++ = rexAMode_M(i->Ain.Lea64.dst, i->Ain.Lea64.am);
+ *p++ = 0x8D;
+ p = doAMode_M(p, i->Ain.Lea64.dst, i->Ain.Lea64.am);
+ goto done;
+
+ case Ain_MulL:
+ subopc = i->Ain.MulL.syned ? 5 : 4;
+ switch (i->Ain.MulL.src->tag) {
+ case Arm_Mem:
+ *p++ = rexAMode_M( fake(0),
+ i->Ain.MulL.src->Arm.Mem.am);
+ *p++ = 0xF7;
+ p = doAMode_M(p, fake(subopc),
+ i->Ain.MulL.src->Arm.Mem.am);
+ goto done;
+ case Arm_Reg:
+ *p++ = rexAMode_R(fake(0),
+ i->Ain.MulL.src->Arm.Reg.reg);
+ *p++ = 0xF7;
+ p = doAMode_R(p, fake(subopc),
+ i->Ain.MulL.src->Arm.Reg.reg);
+ goto done;
+ default:
+ goto bad;
+ }
+ break;
+
+ case Ain_Div:
+ subopc = i->Ain.Div.syned ? 7 : 6;
+ if (i->Ain.Div.sz == 4) {
+ switch (i->Ain.Div.src->tag) {
+ case Arm_Mem:
+ goto bad;
+ /*FIXME*/
+ *p++ = 0xF7;
+ p = doAMode_M(p, fake(subopc),
+ i->Ain.Div.src->Arm.Mem.am);
+ goto done;
+ case Arm_Reg:
+ *p++ = clearWBit(
+ rexAMode_R( fake(0), i->Ain.Div.src->Arm.Reg.reg));
+ *p++ = 0xF7;
+ p = doAMode_R(p, fake(subopc),
+ i->Ain.Div.src->Arm.Reg.reg);
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+ if (i->Ain.Div.sz == 8) {
+ switch (i->Ain.Div.src->tag) {
+ case Arm_Mem:
+ *p++ = rexAMode_M( fake(0),
+ i->Ain.Div.src->Arm.Mem.am);
+ *p++ = 0xF7;
+ p = doAMode_M(p, fake(subopc),
+ i->Ain.Div.src->Arm.Mem.am);
+ goto done;
+ case Arm_Reg:
+ *p++ = rexAMode_R( fake(0),
+ i->Ain.Div.src->Arm.Reg.reg);
+ *p++ = 0xF7;
+ p = doAMode_R(p, fake(subopc),
+ i->Ain.Div.src->Arm.Reg.reg);
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+ break;
+
+//.. case Xin_Sh3232:
+//.. vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
+//.. if (i->Xin.Sh3232.amt == 0) {
+//.. /* shldl/shrdl by %cl */
+//.. *p++ = 0x0F;
+//.. if (i->Xin.Sh3232.op == Xsh_SHL) {
+//.. *p++ = 0xA5;
+//.. } else {
+//.. *p++ = 0xAD;
+//.. }
+//.. p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
+//.. goto done;
+//.. }
+//.. break;
+
+ case Ain_Push:
+ switch (i->Ain.Push.src->tag) {
+ case Armi_Mem:
+ *p++ = clearWBit(
+ rexAMode_M(fake(0), i->Ain.Push.src->Armi.Mem.am));
+ *p++ = 0xFF;
+ p = doAMode_M(p, fake(6), i->Ain.Push.src->Armi.Mem.am);
+ goto done;
+ case Armi_Imm:
+ *p++ = 0x68;
+ p = emit32(p, i->Ain.Push.src->Armi.Imm.imm32);
+ goto done;
+ case Armi_Reg:
+ *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.Push.src->Armi.Reg.reg)));
+ *p++ = toUChar(0x50 + iregBits210(i->Ain.Push.src->Armi.Reg.reg));
+ goto done;
+ default:
+ goto bad;
+ }
+
+ case Ain_Call: {
+ /* As per detailed comment for Ain_Call in
+ getRegUsage_AMD64Instr above, %r11 is used as an address
+ temporary. */
+ /* jump over the following two insns if the condition does not
+ hold */
+ Bool shortImm = fitsIn32Bits(i->Ain.Call.target);
+ if (i->Ain.Call.cond != Acc_ALWAYS) {
+ *p++ = toUChar(0x70 + (0xF & (i->Ain.Call.cond ^ 1)));
+ *p++ = shortImm ? 10 : 13;
+ /* 10 or 13 bytes in the next two insns */
+ }
+ if (shortImm) {
+            /* 7 bytes: movq $sign-extend(imm32), %r11 */
+ *p++ = 0x49;
+ *p++ = 0xC7;
+ *p++ = 0xC3;
+ p = emit32(p, (UInt)i->Ain.Call.target);
+ } else {
+ /* 10 bytes: movabsq $target, %r11 */
+ *p++ = 0x49;
+ *p++ = 0xBB;
+ p = emit64(p, i->Ain.Call.target);
+ }
+ /* 3 bytes: call *%r11 */
+ *p++ = 0x41;
+ *p++ = 0xFF;
+ *p++ = 0xD3;
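+         /* Worked sketch of the whole sequence (assuming Acc_Z
+            encodes as 0x4): cond = Acc_Z, target = 0x1234 gives
+               75 0A                   jnz .+10
+               49 C7 C3 34 12 00 00    movq $0x1234, %r11
+               41 FF D3                call *%r11
+         */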
+ goto done;
+ }
+
+ case Ain_Goto:
+ /* Use ptmp for backpatching conditional jumps. */
+ ptmp = NULL;
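+
+         /* Overall shape of what gets emitted below:
+               j<!cond> skip            (only if cond != Acc_ALWAYS)
+               movl $TRC_VALUE, %ebp    (only for non-boring jk)
+               movq dst, %rax
+               movq $dispatch, %rdx
+               jmp *%rdx
+            skip:
+         */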
+
+ /* First off, if this is conditional, create a conditional
+ jump over the rest of it. */
+ if (i->Ain.Goto.cond != Acc_ALWAYS) {
+ /* jmp fwds if !condition */
+ *p++ = toUChar(0x70 + (i->Ain.Goto.cond ^ 1));
+ ptmp = p; /* fill in this bit later */
+ *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+ }
+
+         /* If non-boring, set %rbp (the guest state pointer)
+ appropriately. Since these numbers are all small positive
+ integers, we can get away with "movl $N, %ebp" rather than
+ the longer "movq $N, %rbp". */
+ /* movl $magic_number, %ebp */
+ switch (i->Ain.Goto.jk) {
+ case Ijk_ClientReq:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_CLIENTREQ); break;
+ case Ijk_Sys_syscall:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SYS_SYSCALL); break;
+ case Ijk_Sys_int32:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SYS_INT32); break;
+ case Ijk_Yield:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_YIELD); break;
+ case Ijk_EmWarn:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_EMWARN); break;
+ case Ijk_MapFail:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_MAPFAIL); break;
+ case Ijk_NoDecode:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_NODECODE); break;
+ case Ijk_TInval:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_TINVAL); break;
+ case Ijk_NoRedir:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_NOREDIR); break;
+ case Ijk_SigTRAP:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SIGTRAP); break;
+ case Ijk_SigSEGV:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SIGSEGV); break;
+ case Ijk_Ret:
+ case Ijk_Call:
+ case Ijk_Boring:
+ break;
+ default:
+ ppIRJumpKind(i->Ain.Goto.jk);
+ vpanic("emit_AMD64Instr.Ain_Goto: unknown jump kind");
+ }
+
+ /* Get the destination address into %rax */
+ if (i->Ain.Goto.dst->tag == Ari_Imm) {
+            /* movq $sign-extend(imm32), %rax */
+ *p++ = 0x48;
+ *p++ = 0xC7;
+ *p++ = 0xC0;
+ p = emit32(p, i->Ain.Goto.dst->Ari.Imm.imm32);
+ } else {
+ vassert(i->Ain.Goto.dst->tag == Ari_Reg);
+            /* movq %reg, %rax */
+ if (i->Ain.Goto.dst->Ari.Reg.reg != hregAMD64_RAX()) {
+ *p++ = rexAMode_R(i->Ain.Goto.dst->Ari.Reg.reg, hregAMD64_RAX());
+ *p++ = 0x89;
+ p = doAMode_R(p, i->Ain.Goto.dst->Ari.Reg.reg, hregAMD64_RAX());
+ }
+ }
+
+ /* Get the dispatcher address into %rdx. This has to happen
+ after the load of %rax since %rdx might be carrying the value
+ destined for %rax immediately prior to this Ain_Goto. */
+ vassert(sizeof(ULong) == sizeof(void*));
+ vassert(dispatch != NULL);
+
+ if (fitsIn32Bits(Ptr_to_ULong(dispatch))) {
+            /* movq $sign-extend(imm32), %rdx */
+ *p++ = 0x48;
+ *p++ = 0xC7;
+ *p++ = 0xC2;
+ p = emit32(p, (UInt)Ptr_to_ULong(dispatch));
+ } else {
+ /* movabsq $imm64, %rdx */
+ *p++ = 0x48;
+ *p++ = 0xBA;
+ p = emit64(p, Ptr_to_ULong(dispatch));
+ }
+ /* jmp *%rdx */
+ *p++ = 0xFF;
+ *p++ = 0xE2;
+
+ /* Fix up the conditional jump, if there was one. */
+ if (i->Ain.Goto.cond != Acc_ALWAYS) {
+ Int delta = p - ptmp;
+ vassert(delta > 0 && delta < 30);
+ *ptmp = toUChar(delta-1);
+ }
+ goto done;
+
+ case Ain_CMov64:
+ vassert(i->Ain.CMov64.cond != Acc_ALWAYS);
+ if (i->Ain.CMov64.src->tag == Arm_Reg) {
+ *p++ = rexAMode_R(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
+ *p++ = 0x0F;
+ *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
+ p = doAMode_R(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Reg.reg);
+ goto done;
+ }
+ if (i->Ain.CMov64.src->tag == Arm_Mem) {
+ *p++ = rexAMode_M(i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
+ *p++ = 0x0F;
+ *p++ = toUChar(0x40 + (0xF & i->Ain.CMov64.cond));
+ p = doAMode_M(p, i->Ain.CMov64.dst, i->Ain.CMov64.src->Arm.Mem.am);
+ goto done;
+ }
+ break;
+
+ case Ain_MovxLQ:
+ /* No, _don't_ ask me why the sense of the args has to be
+ different in the S vs Z case. I don't know. */
+ if (i->Ain.MovxLQ.syned) {
+ /* Need REX.W = 1 here, but rexAMode_R does that for us. */
+ *p++ = rexAMode_R(i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
+ *p++ = 0x63;
+ p = doAMode_R(p, i->Ain.MovxLQ.dst, i->Ain.MovxLQ.src);
+ } else {
+ /* Produce a 32-bit reg-reg move, since the implicit
+ zero-extend does what we want. */
+ *p++ = clearWBit (
+ rexAMode_R(i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst));
+ *p++ = 0x89;
+ p = doAMode_R(p, i->Ain.MovxLQ.src, i->Ain.MovxLQ.dst);
+ }
+ goto done;
+
+ case Ain_LoadEX:
+ if (i->Ain.LoadEX.szSmall == 1 && !i->Ain.LoadEX.syned) {
+ /* movzbq */
+ *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
+ *p++ = 0x0F;
+ *p++ = 0xB6;
+ p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
+ goto done;
+ }
+ if (i->Ain.LoadEX.szSmall == 2 && !i->Ain.LoadEX.syned) {
+ /* movzwq */
+ *p++ = rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
+ *p++ = 0x0F;
+ *p++ = 0xB7;
+ p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
+ goto done;
+ }
+ if (i->Ain.LoadEX.szSmall == 4 && !i->Ain.LoadEX.syned) {
+ /* movzlq */
+ /* This isn't really an existing AMD64 instruction per se.
+ Rather, we have to do a 32-bit load. Because a 32-bit
+ write implicitly clears the upper 32 bits of the target
+ register, we get what we want. */
+ *p++ = clearWBit(
+ rexAMode_M(i->Ain.LoadEX.dst, i->Ain.LoadEX.src));
+ *p++ = 0x8B;
+ p = doAMode_M(p, i->Ain.LoadEX.dst, i->Ain.LoadEX.src);
+ goto done;
+ }
+ break;
+
+ case Ain_Set64:
+ /* Make the destination register be 1 or 0, depending on whether
+ the relevant condition holds. Complication: the top 56 bits
+ of the destination should be forced to zero, but doing 'xorq
+ %r,%r' kills the flag(s) we are about to read. Sigh. So
+            start off by moving $0 into the dest. */
+ reg = iregBits3210(i->Ain.Set64.dst);
+ vassert(reg < 16);
+
+ /* movq $0, %dst */
+ *p++ = toUChar(reg >= 8 ? 0x49 : 0x48);
+ *p++ = 0xC7;
+ *p++ = toUChar(0xC0 + (reg & 7));
+ p = emit32(p, 0);
+
+ /* setb lo8(%dst) */
+         /* note, 8-bit register REX trickiness.  Be careful here. */
+ *p++ = toUChar(reg >= 8 ? 0x41 : 0x40);
+ *p++ = 0x0F;
+ *p++ = toUChar(0x90 + (0x0F & i->Ain.Set64.cond));
+ *p++ = toUChar(0xC0 + (reg & 7));
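+         /* e.g. cond = Acc_Z (0x4), dst = %r9 (encoding 9):
+               49 C7 C1 00 00 00 00    movq $0, %r9
+               41 0F 94 C1             setz %r9b
+            -- without the 0x41 REX, the final modrm byte would pick
+            out %cl rather than %r9b. */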
+ goto done;
+
+ case Ain_Bsfr64:
+ *p++ = rexAMode_R(i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
+ *p++ = 0x0F;
+ if (i->Ain.Bsfr64.isFwds) {
+ *p++ = 0xBC;
+ } else {
+ *p++ = 0xBD;
+ }
+ p = doAMode_R(p, i->Ain.Bsfr64.dst, i->Ain.Bsfr64.src);
+ goto done;
+
+ case Ain_MFence:
+ /* mfence */
+ *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
+ goto done;
+
+ case Ain_ACAS:
+ /* lock */
+ *p++ = 0xF0;
+ if (i->Ain.ACAS.sz == 2) *p++ = 0x66;
+ /* cmpxchg{b,w,l,q} %rbx,mem. Expected-value in %rax, new value
+ in %rbx. The new-value register is hardwired to be %rbx
+ since dealing with byte integer registers is too much hassle,
+ so we force the register operand to %rbx (could equally be
+ %rcx or %rdx). */
+ rex = rexAMode_M( hregAMD64_RBX(), i->Ain.ACAS.addr );
+ if (i->Ain.ACAS.sz != 8)
+ rex = clearWBit(rex);
+
+ *p++ = rex; /* this can emit 0x40, which is pointless. oh well. */
+ *p++ = 0x0F;
+ if (i->Ain.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
+ p = doAMode_M(p, hregAMD64_RBX(), i->Ain.ACAS.addr);
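+         /* e.g. sz = 8 with addr = 0(%rdi) comes out as
+            F0 48 0F B1 1F, i.e. "lock cmpxchgq %rbx, (%rdi)". */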
+ goto done;
+
+ case Ain_DACAS:
+ /* lock */
+ *p++ = 0xF0;
+ /* cmpxchg{8,16}b m{64,128}. Expected-value in %rdx:%rax, new
+ value in %rcx:%rbx. All 4 regs are hardwired in the ISA, so
+ aren't encoded in the insn. */
+         rex = rexAMode_M( fake(1), i->Ain.DACAS.addr );
+         if (i->Ain.DACAS.sz != 8)
+ rex = clearWBit(rex);
+ *p++ = rex;
+ *p++ = 0x0F;
+ *p++ = 0xC7;
+ p = doAMode_M(p, fake(1), i->Ain.DACAS.addr);
+ goto done;
+
+ case Ain_A87Free:
+ vassert(i->Ain.A87Free.nregs > 0 && i->Ain.A87Free.nregs <= 7);
+ for (j = 0; j < i->Ain.A87Free.nregs; j++) {
+ p = do_ffree_st(p, 7-j);
+ }
+ goto done;
+
+ case Ain_A87PushPop:
+ vassert(i->Ain.A87PushPop.szB == 8 || i->Ain.A87PushPop.szB == 4);
+ if (i->Ain.A87PushPop.isPush) {
+ /* Load from memory into %st(0): flds/fldl amode */
+ *p++ = clearWBit(
+ rexAMode_M(fake(0), i->Ain.A87PushPop.addr) );
+ *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
+ p = doAMode_M(p, fake(0)/*subopcode*/, i->Ain.A87PushPop.addr);
+ } else {
+ /* Dump %st(0) to memory: fstps/fstpl amode */
+ *p++ = clearWBit(
+ rexAMode_M(fake(3), i->Ain.A87PushPop.addr) );
+ *p++ = i->Ain.A87PushPop.szB == 4 ? 0xD9 : 0xDD;
+ p = doAMode_M(p, fake(3)/*subopcode*/, i->Ain.A87PushPop.addr);
+ goto done;
+ }
+ goto done;
+
+ case Ain_A87FpOp:
+ switch (i->Ain.A87FpOp.op) {
+ case Afp_SQRT: *p++ = 0xD9; *p++ = 0xFA; break;
+ case Afp_SIN: *p++ = 0xD9; *p++ = 0xFE; break;
+ case Afp_COS: *p++ = 0xD9; *p++ = 0xFF; break;
+ case Afp_TAN: *p++ = 0xD9; *p++ = 0xF2; break;
+ case Afp_ROUND: *p++ = 0xD9; *p++ = 0xFC; break;
+ case Afp_2XM1: *p++ = 0xD9; *p++ = 0xF0; break;
+ case Afp_SCALE: *p++ = 0xD9; *p++ = 0xFD; break;
+ case Afp_ATAN: *p++ = 0xD9; *p++ = 0xF3; break;
+ case Afp_YL2X: *p++ = 0xD9; *p++ = 0xF1; break;
+ case Afp_YL2XP1: *p++ = 0xD9; *p++ = 0xF9; break;
+ case Afp_PREM: *p++ = 0xD9; *p++ = 0xF8; break;
+ case Afp_PREM1: *p++ = 0xD9; *p++ = 0xF5; break;
+ default: goto bad;
+ }
+ goto done;
+
+ case Ain_A87LdCW:
+ *p++ = clearWBit(
+ rexAMode_M(fake(5), i->Ain.A87LdCW.addr) );
+ *p++ = 0xD9;
+ p = doAMode_M(p, fake(5)/*subopcode*/, i->Ain.A87LdCW.addr);
+ goto done;
+
+ case Ain_A87StSW:
+ *p++ = clearWBit(
+ rexAMode_M(fake(7), i->Ain.A87StSW.addr) );
+ *p++ = 0xDD;
+ p = doAMode_M(p, fake(7)/*subopcode*/, i->Ain.A87StSW.addr);
+ goto done;
+
+ case Ain_Store:
+ if (i->Ain.Store.sz == 2) {
+            /* This just goes to show the craziness of the instruction
+ set encoding. We have to insert two prefix bytes, but be
+ careful to avoid a conflict in what the size should be, by
+ ensuring that REX.W = 0. */
+ *p++ = 0x66; /* override to 16-bits */
+ *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
+ *p++ = 0x89;
+ p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
+ goto done;
+ }
+ if (i->Ain.Store.sz == 4) {
+ *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
+ *p++ = 0x89;
+ p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
+ goto done;
+ }
+ if (i->Ain.Store.sz == 1) {
+ /* This is one place where it would be wrong to skip emitting
+ a rex byte of 0x40, since the mere presence of rex changes
+ the meaning of the byte register access. Be careful. */
+ *p++ = clearWBit( rexAMode_M( i->Ain.Store.src, i->Ain.Store.dst) );
+ *p++ = 0x88;
+ p = doAMode_M(p, i->Ain.Store.src, i->Ain.Store.dst);
+ goto done;
+ }
+ break;
+
+//.. case Xin_FpUnary:
+//.. /* gop %src, %dst
+//.. --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
+//.. */
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 0+hregNumber(i->Xin.FpUnary.src));
+//.. p = do_fop1_st(p, i->Xin.FpUnary.op);
+//.. p = do_fstp_st(p, 1+hregNumber(i->Xin.FpUnary.dst));
+//.. goto done;
+//..
+//.. case Xin_FpBinary:
+//.. if (i->Xin.FpBinary.op == Xfp_YL2X
+//.. || i->Xin.FpBinary.op == Xfp_YL2XP1) {
+//.. /* Have to do this specially. */
+//.. /* ffree %st7 ; fld %st(srcL) ;
+//.. ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
+//.. *p++ = 0xD9;
+//.. *p++ = i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9;
+//.. p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
+//.. goto done;
+//.. }
+//.. if (i->Xin.FpBinary.op == Xfp_ATAN) {
+//.. /* Have to do this specially. */
+//.. /* ffree %st7 ; fld %st(srcL) ;
+//.. ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
+//.. *p++ = 0xD9; *p++ = 0xF3;
+//.. p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
+//.. goto done;
+//.. }
+//.. if (i->Xin.FpBinary.op == Xfp_PREM
+//.. || i->Xin.FpBinary.op == Xfp_PREM1
+//.. || i->Xin.FpBinary.op == Xfp_SCALE) {
+//.. /* Have to do this specially. */
+//.. /* ffree %st7 ; fld %st(srcR) ;
+//.. ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ;
+//.. fincstp ; ffree %st7 */
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcR));
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcL));
+//.. *p++ = 0xD9;
+//.. switch (i->Xin.FpBinary.op) {
+//.. case Xfp_PREM: *p++ = 0xF8; break;
+//.. case Xfp_PREM1: *p++ = 0xF5; break;
+//.. case Xfp_SCALE: *p++ = 0xFD; break;
+//.. default: vpanic("emitAMD64Instr(FpBinary,PREM/PREM1/SCALE)");
+//.. }
+//.. p = do_fstp_st(p, 2+hregNumber(i->Xin.FpBinary.dst));
+//.. *p++ = 0xD9; *p++ = 0xF7;
+//.. p = do_ffree_st7(p);
+//.. goto done;
+//.. }
+//.. /* General case */
+//.. /* gop %srcL, %srcR, %dst
+//.. --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
+//.. */
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
+//.. p = do_fop2_st(p, i->Xin.FpBinary.op,
+//.. 1+hregNumber(i->Xin.FpBinary.srcR));
+//.. p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
+//.. goto done;
+//..
+//.. case Xin_FpLdSt:
+//.. vassert(i->Xin.FpLdSt.sz == 4 || i->Xin.FpLdSt.sz == 8);
+//.. if (i->Xin.FpLdSt.isLoad) {
+//.. /* Load from memory into %fakeN.
+//.. --> ffree %st(7) ; fld{s/l} amode ; fstp st(N+1)
+//.. */
+//.. p = do_ffree_st7(p);
+//.. *p++ = i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD;
+//.. p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
+//.. p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
+//.. goto done;
+//.. } else {
+//.. /* Store from %fakeN into memory.
+//.. --> ffree %st(7) ; fld st(N) ; fstp{l|s} amode
+//.. */
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
+//.. *p++ = i->Xin.FpLdSt.sz==4 ? 0xD9 : 0xDD;
+//.. p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
+//.. goto done;
+//.. }
+//.. break;
+//..
+//.. case Xin_FpLdStI:
+//.. if (i->Xin.FpLdStI.isLoad) {
+//.. /* Load from memory into %fakeN, converting from an int.
+//.. --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1)
+//.. */
+//.. switch (i->Xin.FpLdStI.sz) {
+//.. case 8: opc = 0xDF; subopc_imm = 5; break;
+//.. case 4: opc = 0xDB; subopc_imm = 0; break;
+//.. case 2: vassert(0); opc = 0xDF; subopc_imm = 0; break;
+//.. default: vpanic("emitAMD64Instr(Xin_FpLdStI-load)");
+//.. }
+//.. p = do_ffree_st7(p);
+//.. *p++ = opc;
+//.. p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
+//.. p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdStI.reg));
+//.. goto done;
+//.. } else {
+//.. /* Store from %fakeN into memory, converting to an int.
+//.. --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
+//.. */
+//.. switch (i->Xin.FpLdStI.sz) {
+//.. case 8: opc = 0xDF; subopc_imm = 7; break;
+//.. case 4: opc = 0xDB; subopc_imm = 3; break;
+//.. case 2: opc = 0xDF; subopc_imm = 3; break;
+//.. default: vpanic("emitAMD64Instr(Xin_FpLdStI-store)");
+//.. }
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdStI.reg));
+//.. *p++ = opc;
+//.. p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
+//.. goto done;
+//.. }
+//.. break;
+//..
+//.. case Xin_Fp64to32:
+//.. /* ffree %st7 ; fld %st(src) */
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 0+fregNo(i->Xin.Fp64to32.src));
+//.. /* subl $4, %esp */
+//.. *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
+//.. /* fstps (%esp) */
+//.. *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
+//.. /* flds (%esp) */
+//.. *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
+//.. /* addl $4, %esp */
+//.. *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
+//.. /* fstp %st(1+dst) */
+//.. p = do_fstp_st(p, 1+fregNo(i->Xin.Fp64to32.dst));
+//.. goto done;
+//..
+//.. case Xin_FpCMov:
+//.. /* jmp fwds if !condition */
+//.. *p++ = 0x70 + (i->Xin.FpCMov.cond ^ 1);
+//.. *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
+//.. ptmp = p;
+//..
+//.. /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
+//.. p = do_ffree_st7(p);
+//.. p = do_fld_st(p, 0+fregNo(i->Xin.FpCMov.src));
+//.. p = do_fstp_st(p, 1+fregNo(i->Xin.FpCMov.dst));
+//..
+//.. /* Fill in the jump offset. */
+//.. *(ptmp-1) = p - ptmp;
+//.. goto done;
+
+ case Ain_LdMXCSR:
+ *p++ = clearWBit(rexAMode_M( fake(0), i->Ain.LdMXCSR.addr));
+ *p++ = 0x0F;
+ *p++ = 0xAE;
+ p = doAMode_M(p, fake(2)/*subopcode*/, i->Ain.LdMXCSR.addr);
+ goto done;
+
+//.. case Xin_FpStSW_AX:
+//.. /* note, this emits fnstsw %ax, not fstsw %ax */
+//.. *p++ = 0xDF;
+//.. *p++ = 0xE0;
+//.. goto done;
+
+ case Ain_SseUComIS:
+ /* ucomi[sd] %srcL, %srcR ; pushfq ; popq %dst */
+ /* ucomi[sd] %srcL, %srcR */
+ if (i->Ain.SseUComIS.sz == 8) {
+ *p++ = 0x66;
+ } else {
+ goto bad;
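+            /* the sz == 4 (ucomiss) case is not handled yet --
+               awaiting a test case */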
+ vassert(i->Ain.SseUComIS.sz == 4);
+ }
+ *p++ = clearWBit (
+ rexAMode_R( vreg2ireg(i->Ain.SseUComIS.srcL),
+ vreg2ireg(i->Ain.SseUComIS.srcR) ));
+ *p++ = 0x0F;
+ *p++ = 0x2E;
+ p = doAMode_R(p, vreg2ireg(i->Ain.SseUComIS.srcL),
+ vreg2ireg(i->Ain.SseUComIS.srcR) );
+ /* pushfq */
+ *p++ = 0x9C;
+ /* popq %dst */
+ *p++ = toUChar(0x40 + (1 & iregBit3(i->Ain.SseUComIS.dst)));
+ *p++ = toUChar(0x58 + iregBits210(i->Ain.SseUComIS.dst));
+ goto done;
+
+ case Ain_SseSI2SF:
+         /* cvtsi2s[sd] %src, %dst */
+ rex = rexAMode_R( vreg2ireg(i->Ain.SseSI2SF.dst),
+ i->Ain.SseSI2SF.src );
+ *p++ = toUChar(i->Ain.SseSI2SF.szD==4 ? 0xF3 : 0xF2);
+ *p++ = toUChar(i->Ain.SseSI2SF.szS==4 ? clearWBit(rex) : rex);
+ *p++ = 0x0F;
+ *p++ = 0x2A;
+ p = doAMode_R( p, vreg2ireg(i->Ain.SseSI2SF.dst),
+ i->Ain.SseSI2SF.src );
+ goto done;
+
+ case Ain_SseSF2SI:
+         /* cvts[sd]2si %src, %dst */
+ rex = rexAMode_R( i->Ain.SseSF2SI.dst,
+ vreg2ireg(i->Ain.SseSF2SI.src) );
+ *p++ = toUChar(i->Ain.SseSF2SI.szS==4 ? 0xF3 : 0xF2);
+ *p++ = toUChar(i->Ain.SseSF2SI.szD==4 ? clearWBit(rex) : rex);
+ *p++ = 0x0F;
+ *p++ = 0x2D;
+ p = doAMode_R( p, i->Ain.SseSF2SI.dst,
+ vreg2ireg(i->Ain.SseSF2SI.src) );
+ goto done;
+
+ case Ain_SseSDSS:
+ /* cvtsd2ss/cvtss2sd %src, %dst */
+ *p++ = toUChar(i->Ain.SseSDSS.from64 ? 0xF2 : 0xF3);
+ *p++ = clearWBit(
+ rexAMode_R( vreg2ireg(i->Ain.SseSDSS.dst),
+ vreg2ireg(i->Ain.SseSDSS.src) ));
+ *p++ = 0x0F;
+ *p++ = 0x5A;
+ p = doAMode_R( p, vreg2ireg(i->Ain.SseSDSS.dst),
+ vreg2ireg(i->Ain.SseSDSS.src) );
+ goto done;
+
+//..
+//.. case Xin_FpCmp:
+//.. /* gcmp %fL, %fR, %dst
+//.. -> ffree %st7; fpush %fL ; fucomp %(fR+1) ;
+//.. fnstsw %ax ; movl %eax, %dst
+//.. */
+//.. /* ffree %st7 */
+//.. p = do_ffree_st7(p);
+//.. /* fpush %fL */
+//.. p = do_fld_st(p, 0+fregNo(i->Xin.FpCmp.srcL));
+//.. /* fucomp %(fR+1) */
+//.. *p++ = 0xDD;
+//.. *p++ = 0xE8 + (7 & (1+fregNo(i->Xin.FpCmp.srcR)));
+//.. /* fnstsw %ax */
+//.. *p++ = 0xDF;
+//.. *p++ = 0xE0;
+//.. /* movl %eax, %dst */
+//.. *p++ = 0x89;
+//.. p = doAMode_R(p, hregAMD64_EAX(), i->Xin.FpCmp.dst);
+//.. goto done;
+//..
+//.. case Xin_SseConst: {
+//.. UShort con = i->Xin.SseConst.con;
+//.. p = push_word_from_tags(p, (con >> 12) & 0xF);
+//.. p = push_word_from_tags(p, (con >> 8) & 0xF);
+//.. p = push_word_from_tags(p, (con >> 4) & 0xF);
+//.. p = push_word_from_tags(p, con & 0xF);
+//.. /* movl (%esp), %xmm-dst */
+//.. *p++ = 0x0F;
+//.. *p++ = 0x10;
+//.. *p++ = 0x04 + 8 * (7 & vregNo(i->Xin.SseConst.dst));
+//.. *p++ = 0x24;
+//.. /* addl $16, %esp */
+//.. *p++ = 0x83;
+//.. *p++ = 0xC4;
+//.. *p++ = 0x10;
+//.. goto done;
+//.. }
+
+ case Ain_SseLdSt:
+ if (i->Ain.SseLdSt.sz == 8) {
+ *p++ = 0xF2;
+ } else
+ if (i->Ain.SseLdSt.sz == 4) {
+ *p++ = 0xF3;
+ } else
+ if (i->Ain.SseLdSt.sz != 16) {
+ vassert(0);
+ }
+ *p++ = clearWBit(
+ rexAMode_M( vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr));
+ *p++ = 0x0F;
+ *p++ = toUChar(i->Ain.SseLdSt.isLoad ? 0x10 : 0x11);
+ p = doAMode_M(p, vreg2ireg(i->Ain.SseLdSt.reg), i->Ain.SseLdSt.addr);
+ goto done;
+
+ case Ain_SseLdzLO:
+ vassert(i->Ain.SseLdzLO.sz == 4 || i->Ain.SseLdzLO.sz == 8);
+ /* movs[sd] amode, %xmm-dst */
+ *p++ = toUChar(i->Ain.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
+ *p++ = clearWBit(
+ rexAMode_M(vreg2ireg(i->Ain.SseLdzLO.reg),
+ i->Ain.SseLdzLO.addr));
+ *p++ = 0x0F;
+ *p++ = 0x10;
+ p = doAMode_M(p, vreg2ireg(i->Ain.SseLdzLO.reg),
+ i->Ain.SseLdzLO.addr);
+ goto done;
+
+ case Ain_Sse32Fx4:
+ xtra = 0;
+ *p++ = clearWBit(
+ rexAMode_R( vreg2ireg(i->Ain.Sse32Fx4.dst),
+ vreg2ireg(i->Ain.Sse32Fx4.src) ));
+ *p++ = 0x0F;
+ switch (i->Ain.Sse32Fx4.op) {
+ case Asse_ADDF: *p++ = 0x58; break;
+ case Asse_DIVF: *p++ = 0x5E; break;
+ case Asse_MAXF: *p++ = 0x5F; break;
+ case Asse_MINF: *p++ = 0x5D; break;
+ case Asse_MULF: *p++ = 0x59; break;
+ case Asse_RCPF: *p++ = 0x53; break;
+ case Asse_RSQRTF: *p++ = 0x52; break;
+ case Asse_SQRTF: *p++ = 0x51; break;
+ case Asse_SUBF: *p++ = 0x5C; break;
+ case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
+ case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
+ case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
+ case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, vreg2ireg(i->Ain.Sse32Fx4.dst),
+ vreg2ireg(i->Ain.Sse32Fx4.src) );
+ if (xtra & 0x100)
+ *p++ = toUChar(xtra & 0xFF);
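+         /* The 0x100 bit in xtra merely flags that a comparison-
+            predicate imm8 must follow the mod-reg-rm byte; e.g.
+            Asse_CMPLTF comes out as 0F C2 /r 01, i.e. "cmpltps". */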
+ goto done;
+
+ case Ain_Sse64Fx2:
+ xtra = 0;
+ *p++ = 0x66;
+ *p++ = clearWBit(
+ rexAMode_R( vreg2ireg(i->Ain.Sse64Fx2.dst),
+ vreg2ireg(i->Ain.Sse64Fx2.src) ));
+ *p++ = 0x0F;
+ switch (i->Ain.Sse64Fx2.op) {
+ case Asse_ADDF: *p++ = 0x58; break;
+ case Asse_DIVF: *p++ = 0x5E; break;
+ case Asse_MAXF: *p++ = 0x5F; break;
+ case Asse_MINF: *p++ = 0x5D; break;
+ case Asse_MULF: *p++ = 0x59; break;
+//.. case Xsse_RCPF: *p++ = 0x53; break;
+//.. case Xsse_RSQRTF: *p++ = 0x52; break;
+ case Asse_SQRTF: *p++ = 0x51; break;
+ case Asse_SUBF: *p++ = 0x5C; break;
+ case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
+ case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
+ case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
+ case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, vreg2ireg(i->Ain.Sse64Fx2.dst),
+ vreg2ireg(i->Ain.Sse64Fx2.src) );
+ if (xtra & 0x100)
+ *p++ = toUChar(xtra & 0xFF);
+ goto done;
+
+ case Ain_Sse32FLo:
+ xtra = 0;
+ *p++ = 0xF3;
+ *p++ = clearWBit(
+ rexAMode_R( vreg2ireg(i->Ain.Sse32FLo.dst),
+ vreg2ireg(i->Ain.Sse32FLo.src) ));
+ *p++ = 0x0F;
+ switch (i->Ain.Sse32FLo.op) {
+ case Asse_ADDF: *p++ = 0x58; break;
+ case Asse_DIVF: *p++ = 0x5E; break;
+ case Asse_MAXF: *p++ = 0x5F; break;
+ case Asse_MINF: *p++ = 0x5D; break;
+ case Asse_MULF: *p++ = 0x59; break;
+ case Asse_RCPF: *p++ = 0x53; break;
+ case Asse_RSQRTF: *p++ = 0x52; break;
+ case Asse_SQRTF: *p++ = 0x51; break;
+ case Asse_SUBF: *p++ = 0x5C; break;
+ case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
+ case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
+ case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
+ case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, vreg2ireg(i->Ain.Sse32FLo.dst),
+ vreg2ireg(i->Ain.Sse32FLo.src) );
+ if (xtra & 0x100)
+ *p++ = toUChar(xtra & 0xFF);
+ goto done;
+
+ case Ain_Sse64FLo:
+ xtra = 0;
+ *p++ = 0xF2;
+ *p++ = clearWBit(
+ rexAMode_R( vreg2ireg(i->Ain.Sse64FLo.dst),
+ vreg2ireg(i->Ain.Sse64FLo.src) ));
+ *p++ = 0x0F;
+ switch (i->Ain.Sse64FLo.op) {
+ case Asse_ADDF: *p++ = 0x58; break;
+ case Asse_DIVF: *p++ = 0x5E; break;
+ case Asse_MAXF: *p++ = 0x5F; break;
+ case Asse_MINF: *p++ = 0x5D; break;
+ case Asse_MULF: *p++ = 0x59; break;
+//.. case Xsse_RCPF: *p++ = 0x53; break;
+//.. case Xsse_RSQRTF: *p++ = 0x52; break;
+ case Asse_SQRTF: *p++ = 0x51; break;
+ case Asse_SUBF: *p++ = 0x5C; break;
+ case Asse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
+ case Asse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
+ case Asse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
+ case Asse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, vreg2ireg(i->Ain.Sse64FLo.dst),
+ vreg2ireg(i->Ain.Sse64FLo.src) );
+ if (xtra & 0x100)
+ *p++ = toUChar(xtra & 0xFF);
+ goto done;
+
+ case Ain_SseReRg:
+# define XX(_n) *p++ = (_n)
+
+ rex = clearWBit(
+ rexAMode_R( vreg2ireg(i->Ain.SseReRg.dst),
+ vreg2ireg(i->Ain.SseReRg.src) ));
+
+ switch (i->Ain.SseReRg.op) {
+ case Asse_MOV: /*movups*/ XX(rex); XX(0x0F); XX(0x10); break;
+ case Asse_OR: XX(rex); XX(0x0F); XX(0x56); break;
+ case Asse_XOR: XX(rex); XX(0x0F); XX(0x57); break;
+ case Asse_AND: XX(rex); XX(0x0F); XX(0x54); break;
+ case Asse_ANDN: XX(rex); XX(0x0F); XX(0x55); break;
+ case Asse_PACKSSD: XX(0x66); XX(rex); XX(0x0F); XX(0x6B); break;
+ case Asse_PACKSSW: XX(0x66); XX(rex); XX(0x0F); XX(0x63); break;
+ case Asse_PACKUSW: XX(0x66); XX(rex); XX(0x0F); XX(0x67); break;
+ case Asse_ADD8: XX(0x66); XX(rex); XX(0x0F); XX(0xFC); break;
+ case Asse_ADD16: XX(0x66); XX(rex); XX(0x0F); XX(0xFD); break;
+ case Asse_ADD32: XX(0x66); XX(rex); XX(0x0F); XX(0xFE); break;
+ case Asse_ADD64: XX(0x66); XX(rex); XX(0x0F); XX(0xD4); break;
+ case Asse_QADD8S: XX(0x66); XX(rex); XX(0x0F); XX(0xEC); break;
+ case Asse_QADD16S: XX(0x66); XX(rex); XX(0x0F); XX(0xED); break;
+ case Asse_QADD8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDC); break;
+ case Asse_QADD16U: XX(0x66); XX(rex); XX(0x0F); XX(0xDD); break;
+ case Asse_AVG8U: XX(0x66); XX(rex); XX(0x0F); XX(0xE0); break;
+ case Asse_AVG16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE3); break;
+ case Asse_CMPEQ8: XX(0x66); XX(rex); XX(0x0F); XX(0x74); break;
+ case Asse_CMPEQ16: XX(0x66); XX(rex); XX(0x0F); XX(0x75); break;
+ case Asse_CMPEQ32: XX(0x66); XX(rex); XX(0x0F); XX(0x76); break;
+ case Asse_CMPGT8S: XX(0x66); XX(rex); XX(0x0F); XX(0x64); break;
+ case Asse_CMPGT16S: XX(0x66); XX(rex); XX(0x0F); XX(0x65); break;
+ case Asse_CMPGT32S: XX(0x66); XX(rex); XX(0x0F); XX(0x66); break;
+ case Asse_MAX16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEE); break;
+ case Asse_MAX8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDE); break;
+ case Asse_MIN16S: XX(0x66); XX(rex); XX(0x0F); XX(0xEA); break;
+ case Asse_MIN8U: XX(0x66); XX(rex); XX(0x0F); XX(0xDA); break;
+ case Asse_MULHI16U: XX(0x66); XX(rex); XX(0x0F); XX(0xE4); break;
+ case Asse_MULHI16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE5); break;
+ case Asse_MUL16: XX(0x66); XX(rex); XX(0x0F); XX(0xD5); break;
+ case Asse_SHL16: XX(0x66); XX(rex); XX(0x0F); XX(0xF1); break;
+ case Asse_SHL32: XX(0x66); XX(rex); XX(0x0F); XX(0xF2); break;
+ case Asse_SHL64: XX(0x66); XX(rex); XX(0x0F); XX(0xF3); break;
+ case Asse_SAR16: XX(0x66); XX(rex); XX(0x0F); XX(0xE1); break;
+ case Asse_SAR32: XX(0x66); XX(rex); XX(0x0F); XX(0xE2); break;
+ case Asse_SHR16: XX(0x66); XX(rex); XX(0x0F); XX(0xD1); break;
+ case Asse_SHR32: XX(0x66); XX(rex); XX(0x0F); XX(0xD2); break;
+ case Asse_SHR64: XX(0x66); XX(rex); XX(0x0F); XX(0xD3); break;
+ case Asse_SUB8: XX(0x66); XX(rex); XX(0x0F); XX(0xF8); break;
+ case Asse_SUB16: XX(0x66); XX(rex); XX(0x0F); XX(0xF9); break;
+ case Asse_SUB32: XX(0x66); XX(rex); XX(0x0F); XX(0xFA); break;
+ case Asse_SUB64: XX(0x66); XX(rex); XX(0x0F); XX(0xFB); break;
+ case Asse_QSUB8S: XX(0x66); XX(rex); XX(0x0F); XX(0xE8); break;
+ case Asse_QSUB16S: XX(0x66); XX(rex); XX(0x0F); XX(0xE9); break;
+ case Asse_QSUB8U: XX(0x66); XX(rex); XX(0x0F); XX(0xD8); break;
+ case Asse_QSUB16U: XX(0x66); XX(rex); XX(0x0F); XX(0xD9); break;
+ case Asse_UNPCKHB: XX(0x66); XX(rex); XX(0x0F); XX(0x68); break;
+ case Asse_UNPCKHW: XX(0x66); XX(rex); XX(0x0F); XX(0x69); break;
+ case Asse_UNPCKHD: XX(0x66); XX(rex); XX(0x0F); XX(0x6A); break;
+ case Asse_UNPCKHQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6D); break;
+ case Asse_UNPCKLB: XX(0x66); XX(rex); XX(0x0F); XX(0x60); break;
+ case Asse_UNPCKLW: XX(0x66); XX(rex); XX(0x0F); XX(0x61); break;
+ case Asse_UNPCKLD: XX(0x66); XX(rex); XX(0x0F); XX(0x62); break;
+ case Asse_UNPCKLQ: XX(0x66); XX(rex); XX(0x0F); XX(0x6C); break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, vreg2ireg(i->Ain.SseReRg.dst),
+ vreg2ireg(i->Ain.SseReRg.src) );
+# undef XX
+ goto done;
+
+ case Ain_SseCMov:
+ /* jmp fwds if !condition */
+ *p++ = toUChar(0x70 + (i->Ain.SseCMov.cond ^ 1));
+ *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
+ ptmp = p;
+
+ /* movaps %src, %dst */
+ *p++ = clearWBit(
+ rexAMode_R( vreg2ireg(i->Ain.SseCMov.dst),
+ vreg2ireg(i->Ain.SseCMov.src) ));
+ *p++ = 0x0F;
+ *p++ = 0x28;
+ p = doAMode_R(p, vreg2ireg(i->Ain.SseCMov.dst),
+ vreg2ireg(i->Ain.SseCMov.src) );
+
+ /* Fill in the jump offset. */
+ *(ptmp-1) = toUChar(p - ptmp);
+ goto done;
+
+ case Ain_SseShuf:
+ *p++ = 0x66;
+ *p++ = clearWBit(
+ rexAMode_R( vreg2ireg(i->Ain.SseShuf.dst),
+ vreg2ireg(i->Ain.SseShuf.src) ));
+ *p++ = 0x0F;
+ *p++ = 0x70;
+ p = doAMode_R(p, vreg2ireg(i->Ain.SseShuf.dst),
+ vreg2ireg(i->Ain.SseShuf.src) );
+ *p++ = (UChar)(i->Ain.SseShuf.order);
+ goto done;
+
+ default:
+ goto bad;
+ }
+
+ bad:
+ ppAMD64Instr(i, mode64);
+ vpanic("emit_AMD64Instr");
+ /*NOTREACHED*/
+
+ done:
+ vassert(p - &buf[0] <= 32);
+ return p - &buf[0];
+
+# undef fake
+}
+
+/*---------------------------------------------------------------*/
+/*--- end host_amd64_defs.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_amd64_defs.h b/VEX/priv/host_amd64_defs.h
new file mode 100644
index 0000000..cf19bac
--- /dev/null
+++ b/VEX/priv/host_amd64_defs.h
@@ -0,0 +1,753 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_amd64_defs.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#ifndef __VEX_HOST_AMD64_DEFS_H
+#define __VEX_HOST_AMD64_DEFS_H
+
+
+/* --------- Registers. --------- */
+
+/* The usual HReg abstraction. There are 16 real int regs, 6 real
+ float regs, and 16 real vector regs.
+*/
+
+extern void ppHRegAMD64 ( HReg );
+
+extern HReg hregAMD64_RAX ( void );
+extern HReg hregAMD64_RBX ( void );
+extern HReg hregAMD64_RCX ( void );
+extern HReg hregAMD64_RDX ( void );
+extern HReg hregAMD64_RSP ( void );
+extern HReg hregAMD64_RBP ( void );
+extern HReg hregAMD64_RSI ( void );
+extern HReg hregAMD64_RDI ( void );
+extern HReg hregAMD64_R8 ( void );
+extern HReg hregAMD64_R9 ( void );
+extern HReg hregAMD64_R10 ( void );
+extern HReg hregAMD64_R11 ( void );
+extern HReg hregAMD64_R12 ( void );
+extern HReg hregAMD64_R13 ( void );
+extern HReg hregAMD64_R14 ( void );
+extern HReg hregAMD64_R15 ( void );
+
+extern HReg hregAMD64_FAKE0 ( void );
+extern HReg hregAMD64_FAKE1 ( void );
+extern HReg hregAMD64_FAKE2 ( void );
+extern HReg hregAMD64_FAKE3 ( void );
+extern HReg hregAMD64_FAKE4 ( void );
+extern HReg hregAMD64_FAKE5 ( void );
+
+extern HReg hregAMD64_XMM0 ( void );
+extern HReg hregAMD64_XMM1 ( void );
+extern HReg hregAMD64_XMM2 ( void );
+extern HReg hregAMD64_XMM3 ( void );
+extern HReg hregAMD64_XMM4 ( void );
+extern HReg hregAMD64_XMM5 ( void );
+extern HReg hregAMD64_XMM6 ( void );
+extern HReg hregAMD64_XMM7 ( void );
+extern HReg hregAMD64_XMM8 ( void );
+extern HReg hregAMD64_XMM9 ( void );
+extern HReg hregAMD64_XMM10 ( void );
+extern HReg hregAMD64_XMM11 ( void );
+extern HReg hregAMD64_XMM12 ( void );
+extern HReg hregAMD64_XMM13 ( void );
+extern HReg hregAMD64_XMM14 ( void );
+extern HReg hregAMD64_XMM15 ( void );
+
+
+/* --------- Condition codes, AMD encoding. --------- */
+
+typedef
+ enum {
+ Acc_O = 0, /* overflow */
+ Acc_NO = 1, /* no overflow */
+
+ Acc_B = 2, /* below */
+ Acc_NB = 3, /* not below */
+
+ Acc_Z = 4, /* zero */
+ Acc_NZ = 5, /* not zero */
+
+ Acc_BE = 6, /* below or equal */
+ Acc_NBE = 7, /* not below or equal */
+
+ Acc_S = 8, /* negative */
+ Acc_NS = 9, /* not negative */
+
+ Acc_P = 10, /* parity even */
+ Acc_NP = 11, /* not parity even */
+
+      Acc_L      = 12, /* less */
+ Acc_NL = 13, /* not less */
+
+ Acc_LE = 14, /* less or equal */
+ Acc_NLE = 15, /* not less or equal */
+
+ Acc_ALWAYS = 16 /* the usual hack */
+ }
+ AMD64CondCode;
+
+extern HChar* showAMD64CondCode ( AMD64CondCode );
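+
+/* Illustrative note (not in the original header): these values match
+   the Intel condition-code encoding, so a short conditional jump on
+   cc is emitted as the single opcode byte 0x70+cc, and XORing a code
+   with 1 negates it, e.g. Acc_Z ^ 1 == Acc_NZ (0x74 "jz" vs 0x75
+   "jnz").  This is how the emitter jumps over a move when a
+   condition fails (see the Ain_SseCMov case of emit_AMD64Instr). */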
+
+
+/* --------- Memory address expressions (amodes). --------- */
+
+typedef
+ enum {
+ Aam_IR, /* Immediate + Reg */
+ Aam_IRRS /* Immediate + Reg1 + (Reg2 << Shift) */
+ }
+ AMD64AModeTag;
+
+typedef
+ struct {
+ AMD64AModeTag tag;
+ union {
+ struct {
+ UInt imm;
+ HReg reg;
+ } IR;
+ struct {
+ UInt imm;
+ HReg base;
+ HReg index;
+ Int shift; /* 0, 1, 2 or 3 only */
+ } IRRS;
+ } Aam;
+ }
+ AMD64AMode;
+
+extern AMD64AMode* AMD64AMode_IR ( UInt, HReg );
+extern AMD64AMode* AMD64AMode_IRRS ( UInt, HReg, HReg, Int );
+
+extern AMD64AMode* dopyAMD64AMode ( AMD64AMode* );
+
+extern void ppAMD64AMode ( AMD64AMode* );
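+
+/* Sketch of typical construction (assumed usage, for illustration
+   only): the amode 16(%rbp) would be built as
+
+      AMD64AMode_IR(16, hregAMD64_RBP());
+
+   and 0(%rbp,%r10,8) as
+
+      AMD64AMode_IRRS(0, hregAMD64_RBP(), hregAMD64_R10(), 3);
+
+   The shift field encodes the scale as log2, so 3 means index*8. */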
+
+
+/* --------- Operand, which can be reg, immediate or memory. --------- */
+
+typedef
+ enum {
+ Armi_Imm,
+ Armi_Reg,
+ Armi_Mem
+ }
+ AMD64RMITag;
+
+typedef
+ struct {
+ AMD64RMITag tag;
+ union {
+ struct {
+ UInt imm32;
+ } Imm;
+ struct {
+ HReg reg;
+ } Reg;
+ struct {
+ AMD64AMode* am;
+ } Mem;
+ }
+ Armi;
+ }
+ AMD64RMI;
+
+extern AMD64RMI* AMD64RMI_Imm ( UInt );
+extern AMD64RMI* AMD64RMI_Reg ( HReg );
+extern AMD64RMI* AMD64RMI_Mem ( AMD64AMode* );
+
+extern void ppAMD64RMI ( AMD64RMI* );
+
+
+/* --------- Operand, which can be reg or immediate only. --------- */
+
+typedef
+ enum {
+ Ari_Imm,
+ Ari_Reg
+ }
+ AMD64RITag;
+
+typedef
+ struct {
+ AMD64RITag tag;
+ union {
+ struct {
+ UInt imm32;
+ } Imm;
+ struct {
+ HReg reg;
+ } Reg;
+ }
+ Ari;
+ }
+ AMD64RI;
+
+extern AMD64RI* AMD64RI_Imm ( UInt );
+extern AMD64RI* AMD64RI_Reg ( HReg );
+
+extern void ppAMD64RI ( AMD64RI* );
+
+
+/* --------- Operand, which can be reg or memory only. --------- */
+
+typedef
+ enum {
+ Arm_Reg,
+ Arm_Mem
+ }
+ AMD64RMTag;
+
+typedef
+ struct {
+ AMD64RMTag tag;
+ union {
+ struct {
+ HReg reg;
+ } Reg;
+ struct {
+ AMD64AMode* am;
+ } Mem;
+ }
+ Arm;
+ }
+ AMD64RM;
+
+extern AMD64RM* AMD64RM_Reg ( HReg );
+extern AMD64RM* AMD64RM_Mem ( AMD64AMode* );
+
+extern void ppAMD64RM ( AMD64RM* );
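+
+/* Illustrative note: these three operand kinds mirror the
+   flexibility of the underlying instructions.  For example, an
+   Alu64R's source may be AMD64RMI_Imm(42), AMD64RMI_Reg(r) or
+   AMD64RMI_Mem(am), while its destination is always a register. */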
+
+
+/* --------- Instructions. --------- */
+
+/* --------- */
+typedef
+ enum {
+ Aun_NEG,
+ Aun_NOT
+ }
+ AMD64UnaryOp;
+
+extern HChar* showAMD64UnaryOp ( AMD64UnaryOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Aalu_INVALID,
+ Aalu_MOV,
+ Aalu_CMP,
+ Aalu_ADD, Aalu_SUB, Aalu_ADC, Aalu_SBB,
+ Aalu_AND, Aalu_OR, Aalu_XOR,
+ Aalu_MUL
+ }
+ AMD64AluOp;
+
+extern HChar* showAMD64AluOp ( AMD64AluOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Ash_INVALID,
+ Ash_SHL, Ash_SHR, Ash_SAR
+ }
+ AMD64ShiftOp;
+
+extern HChar* showAMD64ShiftOp ( AMD64ShiftOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Afp_INVALID,
+ /* Binary */
+ Afp_SCALE, Afp_ATAN, Afp_YL2X, Afp_YL2XP1, Afp_PREM, Afp_PREM1,
+ /* Unary */
+ Afp_SQRT,
+ Afp_SIN, Afp_COS, Afp_TAN,
+ Afp_ROUND, Afp_2XM1
+ }
+ A87FpOp;
+
+extern HChar* showA87FpOp ( A87FpOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Asse_INVALID,
+ /* mov */
+ Asse_MOV,
+ /* Floating point binary */
+ Asse_ADDF, Asse_SUBF, Asse_MULF, Asse_DIVF,
+ Asse_MAXF, Asse_MINF,
+ Asse_CMPEQF, Asse_CMPLTF, Asse_CMPLEF, Asse_CMPUNF,
+ /* Floating point unary */
+ Asse_RCPF, Asse_RSQRTF, Asse_SQRTF,
+ /* Bitwise */
+ Asse_AND, Asse_OR, Asse_XOR, Asse_ANDN,
+ Asse_ADD8, Asse_ADD16, Asse_ADD32, Asse_ADD64,
+ Asse_QADD8U, Asse_QADD16U,
+ Asse_QADD8S, Asse_QADD16S,
+ Asse_SUB8, Asse_SUB16, Asse_SUB32, Asse_SUB64,
+ Asse_QSUB8U, Asse_QSUB16U,
+ Asse_QSUB8S, Asse_QSUB16S,
+ Asse_MUL16,
+ Asse_MULHI16U,
+ Asse_MULHI16S,
+ Asse_AVG8U, Asse_AVG16U,
+ Asse_MAX16S,
+ Asse_MAX8U,
+ Asse_MIN16S,
+ Asse_MIN8U,
+ Asse_CMPEQ8, Asse_CMPEQ16, Asse_CMPEQ32,
+ Asse_CMPGT8S, Asse_CMPGT16S, Asse_CMPGT32S,
+ Asse_SHL16, Asse_SHL32, Asse_SHL64,
+ Asse_SHR16, Asse_SHR32, Asse_SHR64,
+ Asse_SAR16, Asse_SAR32,
+ Asse_PACKSSD, Asse_PACKSSW, Asse_PACKUSW,
+ Asse_UNPCKHB, Asse_UNPCKHW, Asse_UNPCKHD, Asse_UNPCKHQ,
+ Asse_UNPCKLB, Asse_UNPCKLW, Asse_UNPCKLD, Asse_UNPCKLQ
+ }
+ AMD64SseOp;
+
+extern HChar* showAMD64SseOp ( AMD64SseOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Ain_Imm64, /* Generate 64-bit literal to register */
+ Ain_Alu64R, /* 64-bit mov/arith/logical, dst=REG */
+ Ain_Alu64M, /* 64-bit mov/arith/logical, dst=MEM */
+ Ain_Sh64, /* 64-bit shift/rotate, dst=REG or MEM */
+ Ain_Test64, /* 64-bit test (AND, set flags, discard result) */
+ Ain_Unary64, /* 64-bit not and neg */
+ Ain_Lea64, /* 64-bit compute EA into a reg */
+ Ain_MulL, /* widening multiply */
+ Ain_Div, /* div and mod */
+//.. Xin_Sh3232, /* shldl or shrdl */
+ Ain_Push, /* push 64-bit value on stack */
+ Ain_Call, /* call to address in register */
+ Ain_Goto, /* conditional/unconditional jmp to dst */
+ Ain_CMov64, /* conditional move */
+ Ain_MovxLQ, /* reg-reg move, zx-ing/sx-ing top half */
+ Ain_LoadEX, /* mov{s,z}{b,w,l}q from mem to reg */
+ Ain_Store, /* store 32/16/8 bit value in memory */
+ Ain_Set64, /* convert condition code to 64-bit value */
+ Ain_Bsfr64, /* 64-bit bsf/bsr */
+ Ain_MFence, /* mem fence */
+ Ain_ACAS, /* 8/16/32/64-bit lock;cmpxchg */
+ Ain_DACAS, /* lock;cmpxchg8b/16b (doubleword ACAS, 2 x
+ 32-bit or 2 x 64-bit only) */
+
+ Ain_A87Free, /* free up x87 registers */
+ Ain_A87PushPop, /* x87 loads/stores */
+ Ain_A87FpOp, /* x87 operations */
+ Ain_A87LdCW, /* load x87 control word */
+ Ain_A87StSW, /* store x87 status word */
+//..
+//.. Xin_FpUnary, /* FP fake unary op */
+//.. Xin_FpBinary, /* FP fake binary op */
+//.. Xin_FpLdSt, /* FP fake load/store */
+//.. Xin_FpLdStI, /* FP fake load/store, converting to/from Int */
+//.. Xin_Fp64to32, /* FP round IEEE754 double to IEEE754 single */
+//.. Xin_FpCMov, /* FP fake floating point conditional move */
+ Ain_LdMXCSR, /* load %mxcsr */
+//.. Xin_FpStSW_AX, /* fstsw %ax */
+ Ain_SseUComIS, /* ucomisd/ucomiss, then get %rflags into int
+ register */
+ Ain_SseSI2SF, /* scalar 32/64 int to 32/64 float conversion */
+ Ain_SseSF2SI, /* scalar 32/64 float to 32/64 int conversion */
+ Ain_SseSDSS, /* scalar float32 to/from float64 */
+//..
+//.. Xin_SseConst, /* Generate restricted SSE literal */
+ Ain_SseLdSt, /* SSE load/store 32/64/128 bits, no alignment
+ constraints, upper 96/64/0 bits arbitrary */
+ Ain_SseLdzLO, /* SSE load low 32/64 bits, zero remainder of reg */
+ Ain_Sse32Fx4, /* SSE binary, 32Fx4 */
+ Ain_Sse32FLo, /* SSE binary, 32F in lowest lane only */
+ Ain_Sse64Fx2, /* SSE binary, 64Fx2 */
+ Ain_Sse64FLo, /* SSE binary, 64F in lowest lane only */
+ Ain_SseReRg, /* SSE binary general reg-reg, Re, Rg */
+ Ain_SseCMov, /* SSE conditional move */
+ Ain_SseShuf /* SSE2 shuffle (pshufd) */
+ }
+ AMD64InstrTag;
+
+/* Destinations are on the RIGHT (second operand) */
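+/* For example (illustrative): AMD64Instr_Alu64R(Aalu_ADD, src, dst)
+   denotes "addq src, dst", i.e. dst := dst + src, following AT&T
+   operand order. */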
+
+typedef
+ struct {
+ AMD64InstrTag tag;
+ union {
+ struct {
+ ULong imm64;
+ HReg dst;
+ } Imm64;
+ struct {
+ AMD64AluOp op;
+ AMD64RMI* src;
+ HReg dst;
+ } Alu64R;
+ struct {
+ AMD64AluOp op;
+ AMD64RI* src;
+ AMD64AMode* dst;
+ } Alu64M;
+ struct {
+ AMD64ShiftOp op;
+ UInt src; /* shift amount, or 0 means %cl */
+ HReg dst;
+ } Sh64;
+ struct {
+ UInt imm32;
+ HReg dst;
+ } Test64;
+ /* Not and Neg */
+ struct {
+ AMD64UnaryOp op;
+ HReg dst;
+ } Unary64;
+ /* 64-bit compute EA into a reg */
+ struct {
+ AMD64AMode* am;
+ HReg dst;
+ } Lea64;
+ /* 64 x 64 -> 128 bit widening multiply: RDX:RAX = RAX *s/u
+ r/m64 */
+ struct {
+ Bool syned;
+ AMD64RM* src;
+ } MulL;
+ /* amd64 div/idiv instruction. Modifies RDX and RAX and
+ reads src. */
+ struct {
+ Bool syned;
+ Int sz; /* 4 or 8 only */
+ AMD64RM* src;
+ } Div;
+//.. /* shld/shrd. op may only be Xsh_SHL or Xsh_SHR */
+//.. struct {
+//.. X86ShiftOp op;
+//.. UInt amt; /* shift amount, or 0 means %cl */
+//.. HReg src;
+//.. HReg dst;
+//.. } Sh3232;
+ struct {
+ AMD64RMI* src;
+ } Push;
+ /* Pseudo-insn. Call target (an absolute address), on given
+            condition (which could be Acc_ALWAYS). */
+ struct {
+ AMD64CondCode cond;
+ Addr64 target;
+ Int regparms; /* 0 .. 6 */
+ } Call;
+ /* Pseudo-insn. Goto dst, on given condition (which could be
+ Acc_ALWAYS). */
+ struct {
+ IRJumpKind jk;
+ AMD64CondCode cond;
+ AMD64RI* dst;
+ } Goto;
+ /* Mov src to dst on the given condition, which may not
+ be the bogus Acc_ALWAYS. */
+ struct {
+ AMD64CondCode cond;
+ AMD64RM* src;
+ HReg dst;
+ } CMov64;
+ /* reg-reg move, sx-ing/zx-ing top half */
+ struct {
+ Bool syned;
+ HReg src;
+ HReg dst;
+ } MovxLQ;
+ /* Sign/Zero extending loads. Dst size is always 64 bits. */
+ struct {
+ UChar szSmall; /* only 1, 2 or 4 */
+ Bool syned;
+ AMD64AMode* src;
+ HReg dst;
+ } LoadEX;
+ /* 32/16/8 bit stores. */
+ struct {
+ UChar sz; /* only 1, 2 or 4 */
+ HReg src;
+ AMD64AMode* dst;
+ } Store;
+ /* Convert an amd64 condition code to a 64-bit value (0 or 1). */
+ struct {
+ AMD64CondCode cond;
+ HReg dst;
+ } Set64;
+ /* 64-bit bsf or bsr. */
+ struct {
+ Bool isFwds;
+ HReg src;
+ HReg dst;
+ } Bsfr64;
+ /* Mem fence. In short, an insn which flushes all preceding
+ loads and stores as much as possible before continuing.
+ On AMD64 we emit a real "mfence". */
+ struct {
+ } MFence;
+ struct {
+ AMD64AMode* addr;
+ UChar sz; /* 1, 2, 4 or 8 */
+ } ACAS;
+ struct {
+ AMD64AMode* addr;
+ UChar sz; /* 4 or 8 only */
+ } DACAS;
+
+ /* --- X87 --- */
+
+ /* A very minimal set of x87 insns, that operate exactly in a
+ stack-like way so no need to think about x87 registers. */
+
+ /* Do 'ffree' on %st(7) .. %st(7-nregs) */
+ struct {
+ Int nregs; /* 1 <= nregs <= 7 */
+ } A87Free;
+
+ /* Push a 32- or 64-bit FP value from memory onto the stack,
+ or move a value from the stack to memory and remove it
+ from the stack. */
+ struct {
+ AMD64AMode* addr;
+ Bool isPush;
+ UChar szB; /* 4 or 8 */
+ } A87PushPop;
+
+ /* Do an operation on the top-of-stack. This can be unary, in
+ which case it is %st0 = OP( %st0 ), or binary: %st0 = OP(
+ %st0, %st1 ). */
+ struct {
+ A87FpOp op;
+ } A87FpOp;
+
+ /* Load the FPU control word. */
+ struct {
+ AMD64AMode* addr;
+ } A87LdCW;
+
+ /* Store the FPU status word (fstsw m16) */
+ struct {
+ AMD64AMode* addr;
+ } A87StSW;
+
+ /* --- SSE --- */
+
+ /* Load 32 bits into %mxcsr. */
+ struct {
+ AMD64AMode* addr;
+ }
+ LdMXCSR;
+//.. /* fstsw %ax */
+//.. struct {
+//.. /* no fields */
+//.. }
+//.. FpStSW_AX;
+ /* ucomisd/ucomiss, then get %rflags into int register */
+ struct {
+ UChar sz; /* 4 or 8 only */
+ HReg srcL; /* xmm */
+ HReg srcR; /* xmm */
+ HReg dst; /* int */
+ } SseUComIS;
+ /* scalar 32/64 int to 32/64 float conversion */
+ struct {
+ UChar szS; /* 4 or 8 */
+ UChar szD; /* 4 or 8 */
+ HReg src; /* i class */
+ HReg dst; /* v class */
+ } SseSI2SF;
+ /* scalar 32/64 float to 32/64 int conversion */
+ struct {
+ UChar szS; /* 4 or 8 */
+ UChar szD; /* 4 or 8 */
+ HReg src; /* v class */
+ HReg dst; /* i class */
+ } SseSF2SI;
+ /* scalar float32 to/from float64 */
+ struct {
+ Bool from64; /* True: 64->32; False: 32->64 */
+ HReg src;
+ HReg dst;
+ } SseSDSS;
+//..
+//.. /* Simplistic SSE[123] */
+//.. struct {
+//.. UShort con;
+//.. HReg dst;
+//.. } SseConst;
+ struct {
+ Bool isLoad;
+ UChar sz; /* 4, 8 or 16 only */
+ HReg reg;
+ AMD64AMode* addr;
+ } SseLdSt;
+ struct {
+ Int sz; /* 4 or 8 only */
+ HReg reg;
+ AMD64AMode* addr;
+ } SseLdzLO;
+ struct {
+ AMD64SseOp op;
+ HReg src;
+ HReg dst;
+ } Sse32Fx4;
+ struct {
+ AMD64SseOp op;
+ HReg src;
+ HReg dst;
+ } Sse32FLo;
+ struct {
+ AMD64SseOp op;
+ HReg src;
+ HReg dst;
+ } Sse64Fx2;
+ struct {
+ AMD64SseOp op;
+ HReg src;
+ HReg dst;
+ } Sse64FLo;
+ struct {
+ AMD64SseOp op;
+ HReg src;
+ HReg dst;
+ } SseReRg;
+ /* Mov src to dst on the given condition, which may not
+            be the bogus Acc_ALWAYS. */
+ struct {
+ AMD64CondCode cond;
+ HReg src;
+ HReg dst;
+ } SseCMov;
+ struct {
+ Int order; /* 0 <= order <= 0xFF */
+ HReg src;
+ HReg dst;
+ } SseShuf;
+
+ } Ain;
+ }
+ AMD64Instr;
+
+extern AMD64Instr* AMD64Instr_Imm64 ( ULong imm64, HReg dst );
+extern AMD64Instr* AMD64Instr_Alu64R ( AMD64AluOp, AMD64RMI*, HReg );
+extern AMD64Instr* AMD64Instr_Alu64M ( AMD64AluOp, AMD64RI*, AMD64AMode* );
+extern AMD64Instr* AMD64Instr_Unary64 ( AMD64UnaryOp op, HReg dst );
+extern AMD64Instr* AMD64Instr_Lea64 ( AMD64AMode* am, HReg dst );
+extern AMD64Instr* AMD64Instr_Sh64 ( AMD64ShiftOp, UInt, HReg );
+extern AMD64Instr* AMD64Instr_Test64 ( UInt imm32, HReg dst );
+extern AMD64Instr* AMD64Instr_MulL ( Bool syned, AMD64RM* );
+extern AMD64Instr* AMD64Instr_Div ( Bool syned, Int sz, AMD64RM* );
+//.. extern AMD64Instr* AMD64Instr_Sh3232 ( AMD64ShiftOp, UInt amt, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_Push ( AMD64RMI* );
+extern AMD64Instr* AMD64Instr_Call ( AMD64CondCode, Addr64, Int );
+extern AMD64Instr* AMD64Instr_Goto ( IRJumpKind, AMD64CondCode cond, AMD64RI* dst );
+extern AMD64Instr* AMD64Instr_CMov64 ( AMD64CondCode, AMD64RM* src, HReg dst );
+extern AMD64Instr* AMD64Instr_MovxLQ ( Bool syned, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_LoadEX ( UChar szSmall, Bool syned,
+ AMD64AMode* src, HReg dst );
+extern AMD64Instr* AMD64Instr_Store ( UChar sz, HReg src, AMD64AMode* dst );
+extern AMD64Instr* AMD64Instr_Set64 ( AMD64CondCode cond, HReg dst );
+extern AMD64Instr* AMD64Instr_Bsfr64 ( Bool isFwds, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_MFence ( void );
+extern AMD64Instr* AMD64Instr_ACAS ( AMD64AMode* addr, UChar sz );
+extern AMD64Instr* AMD64Instr_DACAS ( AMD64AMode* addr, UChar sz );
+
+extern AMD64Instr* AMD64Instr_A87Free ( Int nregs );
+extern AMD64Instr* AMD64Instr_A87PushPop ( AMD64AMode* addr, Bool isPush, UChar szB );
+extern AMD64Instr* AMD64Instr_A87FpOp ( A87FpOp op );
+extern AMD64Instr* AMD64Instr_A87LdCW ( AMD64AMode* addr );
+extern AMD64Instr* AMD64Instr_A87StSW ( AMD64AMode* addr );
+//..
+//.. extern AMD64Instr* AMD64Instr_FpUnary ( AMD64FpOp op, HReg src, HReg dst );
+//.. extern AMD64Instr* AMD64Instr_FpBinary ( AMD64FpOp op, HReg srcL, HReg srcR, HReg dst );
+//.. extern AMD64Instr* AMD64Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, AMD64AMode* );
+//.. extern AMD64Instr* AMD64Instr_FpLdStI ( Bool isLoad, UChar sz, HReg reg, AMD64AMode* );
+//.. extern AMD64Instr* AMD64Instr_Fp64to32 ( HReg src, HReg dst );
+//.. extern AMD64Instr* AMD64Instr_FpCMov ( AMD64CondCode, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_LdMXCSR ( AMD64AMode* );
+//.. extern AMD64Instr* AMD64Instr_FpStSW_AX ( void );
+extern AMD64Instr* AMD64Instr_SseUComIS ( Int sz, HReg srcL, HReg srcR, HReg dst );
+extern AMD64Instr* AMD64Instr_SseSI2SF ( Int szS, Int szD, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_SseSF2SI ( Int szS, Int szD, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_SseSDSS ( Bool from64, HReg src, HReg dst );
+//..
+//.. extern AMD64Instr* AMD64Instr_SseConst ( UShort con, HReg dst );
+extern AMD64Instr* AMD64Instr_SseLdSt ( Bool isLoad, Int sz, HReg, AMD64AMode* );
+extern AMD64Instr* AMD64Instr_SseLdzLO ( Int sz, HReg, AMD64AMode* );
+extern AMD64Instr* AMD64Instr_Sse32Fx4 ( AMD64SseOp, HReg, HReg );
+extern AMD64Instr* AMD64Instr_Sse32FLo ( AMD64SseOp, HReg, HReg );
+extern AMD64Instr* AMD64Instr_Sse64Fx2 ( AMD64SseOp, HReg, HReg );
+extern AMD64Instr* AMD64Instr_Sse64FLo ( AMD64SseOp, HReg, HReg );
+extern AMD64Instr* AMD64Instr_SseReRg ( AMD64SseOp, HReg, HReg );
+extern AMD64Instr* AMD64Instr_SseCMov ( AMD64CondCode, HReg src, HReg dst );
+extern AMD64Instr* AMD64Instr_SseShuf ( Int order, HReg src, HReg dst );
+
+
+extern void ppAMD64Instr ( AMD64Instr*, Bool );
+
+/* Some functions that insulate the register allocator from details
+ of the underlying instruction set. */
+extern void getRegUsage_AMD64Instr ( HRegUsage*, AMD64Instr*, Bool );
+extern void mapRegs_AMD64Instr ( HRegRemap*, AMD64Instr*, Bool );
+extern Bool isMove_AMD64Instr ( AMD64Instr*, HReg*, HReg* );
+extern Int emit_AMD64Instr ( UChar* buf, Int nbuf, AMD64Instr*,
+ Bool, void* dispatch );
+
+extern void genSpill_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offset, Bool );
+extern void genReload_AMD64 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offset, Bool );
+
+extern void getAllocableRegs_AMD64 ( Int*, HReg** );
+extern HInstrArray* iselSB_AMD64 ( IRSB*, VexArch,
+ VexArchInfo*,
+ VexAbiInfo* );
+
+#endif /* ndef __VEX_HOST_AMD64_DEFS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_amd64_defs.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_amd64_isel.c b/VEX/priv/host_amd64_isel.c
new file mode 100644
index 0000000..a54444a
--- /dev/null
+++ b/VEX/priv/host_amd64_isel.c
@@ -0,0 +1,4140 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_amd64_isel.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+#include "ir_match.h"
+#include "main_util.h"
+#include "main_globals.h"
+#include "host_generic_regs.h"
+#include "host_generic_simd64.h"
+#include "host_generic_simd128.h"
+#include "host_amd64_defs.h"
+
+
+/*---------------------------------------------------------*/
+/*--- x87/SSE control word stuff ---*/
+/*---------------------------------------------------------*/
+
+/* Vex-generated code expects to run with the FPU set as follows: all
+ exceptions masked, round-to-nearest, precision = 53 bits. This
+   corresponds to an FPU control word value of 0x027F.
+
+ Similarly the SSE control word (%mxcsr) should be 0x1F80.
+
+ %fpucw and %mxcsr should have these values on entry to
+   Vex-generated code, and those values should be
+ unchanged at exit.
+*/
+
+#define DEFAULT_FPUCW 0x027F
+
+#define DEFAULT_MXCSR 0x1F80
+
+/* debugging only, do not use */
+/* define DEFAULT_FPUCW 0x037F */
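+
+/* Illustrative decode (my reading of the Intel encodings, not part
+   of the original source): in 0x027F, bits 0..5 mask all six x87
+   exceptions, PC (bits 8..9) = 10b selects 53-bit precision, and RC
+   (bits 10..11) = 00b selects round-to-nearest.  In 0x1F80, bits
+   7..12 mask all six SSE exceptions and RC (bits 13..14) = 00b
+   again selects round-to-nearest. */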
+
+
+/*---------------------------------------------------------*/
+/*--- misc helpers ---*/
+/*---------------------------------------------------------*/
+
+/* These are duplicated in guest-amd64/toIR.c */
+static IRExpr* unop ( IROp op, IRExpr* a )
+{
+ return IRExpr_Unop(op, a);
+}
+
+static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
+{
+ return IRExpr_Binop(op, a1, a2);
+}
+
+static IRExpr* bind ( Int binder )
+{
+ return IRExpr_Binder(binder);
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISelEnv ---*/
+/*---------------------------------------------------------*/
+
+/* This carries around:
+
+ - A mapping from IRTemp to IRType, giving the type of any IRTemp we
+ might encounter. This is computed before insn selection starts,
+ and does not change.
+
+ - A mapping from IRTemp to HReg. This tells the insn selector
+ which virtual register is associated with each IRTemp
+ temporary. This is computed before insn selection starts, and
+ does not change. We expect this mapping to map precisely the
+ same set of IRTemps as the type mapping does.
+
+ - vregmap holds the primary register for the IRTemp.
+ - vregmapHI is only used for 128-bit integer-typed
+ IRTemps. It holds the identity of a second
+ 64-bit virtual HReg, which holds the high half
+ of the value.
+
+ - The code array, that is, the insns selected so far.
+
+ - A counter, for generating new virtual registers.
+
+ - The host subarchitecture we are selecting insns for.
+ This is set at the start and does not change.
+
+ Note, this is all host-independent. (JRS 20050201: well, kinda
+ ... not completely. Compare with ISelEnv for X86.)
+*/
+
+typedef
+ struct {
+ IRTypeEnv* type_env;
+
+ HReg* vregmap;
+ HReg* vregmapHI;
+ Int n_vregmap;
+
+ HInstrArray* code;
+
+ Int vreg_ctr;
+
+ UInt hwcaps;
+ }
+ ISelEnv;
+
+
+static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
+{
+ vassert(tmp >= 0);
+ vassert(tmp < env->n_vregmap);
+ return env->vregmap[tmp];
+}
+
+static void lookupIRTemp128 ( HReg* vrHI, HReg* vrLO,
+ ISelEnv* env, IRTemp tmp )
+{
+ vassert(tmp >= 0);
+ vassert(tmp < env->n_vregmap);
+ vassert(env->vregmapHI[tmp] != INVALID_HREG);
+ *vrLO = env->vregmap[tmp];
+ *vrHI = env->vregmapHI[tmp];
+}
+
+static void addInstr ( ISelEnv* env, AMD64Instr* instr )
+{
+ addHInstr(env->code, instr);
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ ppAMD64Instr(instr, True);
+ vex_printf("\n");
+ }
+}
+
+static HReg newVRegI ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcInt64, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+//.. static HReg newVRegF ( ISelEnv* env )
+//.. {
+//.. HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
+//.. env->vreg_ctr++;
+//.. return reg;
+//.. }
+
+static HReg newVRegV ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Forward declarations ---*/
+/*---------------------------------------------------------*/
+
+/* These are organised as iselXXX and iselXXX_wrk pairs. The
+   iselXXX_wrk functions do the real work, but are not to be called
+   directly.  For each XXX, iselXXX calls its iselXXX_wrk
+   counterpart, then checks that all returned registers are virtual.
+*/
+static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
+static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e );
+
+static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
+static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e );
+
+static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e );
+static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e );
+
+static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e );
+
+static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
+static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e );
+
+static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+static void iselInt128Expr ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+
+static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
+static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e );
+
+static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselDblExpr ( ISelEnv* env, IRExpr* e );
+
+static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselFltExpr ( ISelEnv* env, IRExpr* e );
+
+static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselVecExpr ( ISelEnv* env, IRExpr* e );
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Misc helpers ---*/
+/*---------------------------------------------------------*/
+
+static Bool sane_AMode ( AMD64AMode* am )
+{
+ switch (am->tag) {
+ case Aam_IR:
+ return
+ toBool( hregClass(am->Aam.IR.reg) == HRcInt64
+ && (hregIsVirtual(am->Aam.IR.reg)
+ || am->Aam.IR.reg == hregAMD64_RBP()) );
+ case Aam_IRRS:
+ return
+ toBool( hregClass(am->Aam.IRRS.base) == HRcInt64
+ && hregIsVirtual(am->Aam.IRRS.base)
+ && hregClass(am->Aam.IRRS.index) == HRcInt64
+ && hregIsVirtual(am->Aam.IRRS.index) );
+ default:
+ vpanic("sane_AMode: unknown amd64 amode tag");
+ }
+}
+
+
+/* Can the lower 32 bits be signedly widened to produce the whole
+ 64-bit value? In other words, are the top 33 bits either all 0 or
+ all 1 ? */
+static Bool fitsIn32Bits ( ULong x )
+{
+ Long y0 = (Long)x;
+ Long y1 = y0;
+ y1 <<= 32;
+ y1 >>=/*s*/ 32;
+ return toBool(x == y1);
+}
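+
+/* For example: fitsIn32Bits(0xFFFFFFFF80000000ULL) is True, since
+   that is the sign extension of 0x80000000; but
+   fitsIn32Bits(0x0000000080000000ULL) is False, since its top 33
+   bits are neither all 0 nor all 1. */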
+
+/* Is this a 64-bit zero expression? */
+
+static Bool isZeroU64 ( IRExpr* e )
+{
+ return e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U64
+ && e->Iex.Const.con->Ico.U64 == 0ULL;
+}
+
+static Bool isZeroU32 ( IRExpr* e )
+{
+ return e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U32
+ && e->Iex.Const.con->Ico.U32 == 0;
+}
+
+/* Make an int reg-reg move. */
+
+static AMD64Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
+{
+ vassert(hregClass(src) == HRcInt64);
+ vassert(hregClass(dst) == HRcInt64);
+ return AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst);
+}
+
+/* Make a vector reg-reg move. */
+
+static AMD64Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
+{
+ vassert(hregClass(src) == HRcVec128);
+ vassert(hregClass(dst) == HRcVec128);
+ return AMD64Instr_SseReRg(Asse_MOV, src, dst);
+}
+
+/* Advance/retreat %rsp by n. */
+
+static void add_to_rsp ( ISelEnv* env, Int n )
+{
+ vassert(n > 0 && n < 256 && (n%8) == 0);
+ addInstr(env,
+ AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(n),
+ hregAMD64_RSP()));
+}
+
+static void sub_from_rsp ( ISelEnv* env, Int n )
+{
+ vassert(n > 0 && n < 256 && (n%8) == 0);
+ addInstr(env,
+ AMD64Instr_Alu64R(Aalu_SUB, AMD64RMI_Imm(n),
+ hregAMD64_RSP()));
+}
+
+/* Push 64-bit constants on the stack. */
+static void push_uimm64( ISelEnv* env, ULong uimm64 )
+{
+ /* If uimm64 can be expressed as the sign extension of its
+ lower 32 bits, we can do it the easy way. */
+ Long simm64 = (Long)uimm64;
+ if ( simm64 == ((simm64 << 32) >> 32) ) {
+ addInstr( env, AMD64Instr_Push(AMD64RMI_Imm( (UInt)uimm64 )) );
+ } else {
+ HReg tmp = newVRegI(env);
+ addInstr( env, AMD64Instr_Imm64(uimm64, tmp) );
+ addInstr( env, AMD64Instr_Push(AMD64RMI_Reg(tmp)) );
+ }
+}
+
+//.. /* Given an amode, return one which references 4 bytes further
+//.. along. */
+//..
+//.. static X86AMode* advance4 ( X86AMode* am )
+//.. {
+//.. X86AMode* am4 = dopyX86AMode(am);
+//.. switch (am4->tag) {
+//.. case Xam_IRRS:
+//.. am4->Xam.IRRS.imm += 4; break;
+//.. case Xam_IR:
+//.. am4->Xam.IR.imm += 4; break;
+//.. default:
+//.. vpanic("advance4(x86,host)");
+//.. }
+//.. return am4;
+//.. }
+//..
+//..
+//.. /* Push an arg onto the host stack, in preparation for a call to a
+//.. helper function of some kind. Returns the number of 32-bit words
+//.. pushed. */
+//..
+//.. static Int pushArg ( ISelEnv* env, IRExpr* arg )
+//.. {
+//.. IRType arg_ty = typeOfIRExpr(env->type_env, arg);
+//.. if (arg_ty == Ity_I32) {
+//.. addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
+//.. return 1;
+//.. } else
+//.. if (arg_ty == Ity_I64) {
+//.. HReg rHi, rLo;
+//.. iselInt64Expr(&rHi, &rLo, env, arg);
+//.. addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
+//.. addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
+//.. return 2;
+//.. }
+//.. ppIRExpr(arg);
+//.. vpanic("pushArg(x86): can't handle arg of this type");
+//.. }
+
+
+/* Used only in doHelperCall. If possible, produce a single
+ instruction which computes 'e' into 'dst'. If not possible, return
+ NULL. */
+
+static AMD64Instr* iselIntExpr_single_instruction ( ISelEnv* env,
+ HReg dst,
+ IRExpr* e )
+{
+ vassert(typeOfIRExpr(env->type_env, e) == Ity_I64);
+
+ if (e->tag == Iex_Const) {
+ vassert(e->Iex.Const.con->tag == Ico_U64);
+ if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
+ return AMD64Instr_Alu64R(
+ Aalu_MOV,
+ AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64)),
+ dst
+ );
+ } else {
+ return AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, dst);
+ }
+ }
+
+ if (e->tag == Iex_RdTmp) {
+ HReg src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ return mk_iMOVsd_RR(src, dst);
+ }
+
+ if (e->tag == Iex_Get) {
+ vassert(e->Iex.Get.ty == Ity_I64);
+ return AMD64Instr_Alu64R(
+ Aalu_MOV,
+ AMD64RMI_Mem(
+ AMD64AMode_IR(e->Iex.Get.offset,
+ hregAMD64_RBP())),
+ dst);
+ }
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_32Uto64
+ && e->Iex.Unop.arg->tag == Iex_RdTmp) {
+ HReg src = lookupIRTemp(env, e->Iex.Unop.arg->Iex.RdTmp.tmp);
+ return AMD64Instr_MovxLQ(False, src, dst);
+ }
+
+ if (0) { ppIRExpr(e); vex_printf("\n"); }
+
+ return NULL;
+}
+
+
+/* Do a complete function call. guard is a Ity_Bit expression
+ indicating whether or not the call happens. If guard==NULL, the
+ call is unconditional. */
+
+static
+void doHelperCall ( ISelEnv* env,
+ Bool passBBP,
+ IRExpr* guard, IRCallee* cee, IRExpr** args )
+{
+ AMD64CondCode cc;
+ HReg argregs[6];
+ HReg tmpregs[6];
+ AMD64Instr* fastinstrs[6];
+ Int n_args, i, argreg;
+
+ /* Marshal args for a call and do the call.
+
+ If passBBP is True, %rbp (the baseblock pointer) is to be passed
+ as the first arg.
+
+ This function only deals with a tiny set of possibilities, which
+ cover all helpers in practice. The restrictions are that only
+ arguments in registers are supported, hence only 6x64 integer
+ bits in total can be passed. In fact the only supported arg
+ type is I64.
+
+ Generating code which is both efficient and correct when
+ parameters are to be passed in registers is difficult, for the
+ reasons elaborated in detail in comments attached to
+ doHelperCall() in priv/host-x86/isel.c. Here, we use a variant
+ of the method described in those comments.
+
+ The problem is split into two cases: the fast scheme and the
+ slow scheme. In the fast scheme, arguments are computed
+ directly into the target (real) registers. This is only safe
+ when we can be sure that computation of each argument will not
+ trash any real registers set by computation of any other
+ argument.
+
+ In the slow scheme, all args are first computed into vregs, and
+ once they are all done, they are moved to the relevant real
+ regs. This always gives correct code, but it also gives a bunch
+ of vreg-to-rreg moves which are usually redundant but are hard
+ for the register allocator to get rid of.
+
+ To decide which scheme to use, all argument expressions are
+ first examined. If they are all so simple that it is clear they
+ will be evaluated without use of any fixed registers, use the
+ fast scheme, else use the slow scheme. Note also that only
+ unconditional calls may use the fast scheme, since having to
+ compute a condition expression could itself trash real
+ registers.
+
+ Note this requires being able to examine an expression and
+ determine whether or not evaluation of it might use a fixed
+ register. That requires knowledge of how the rest of this insn
+ selector works. Currently just the following 3 are regarded as
+ safe -- hopefully they cover the majority of arguments in
+ practice: IRExpr_Tmp IRExpr_Const IRExpr_Get.
+ */
+
+ /* Note that the cee->regparms field is meaningless on AMD64 host
+ (since there is only one calling convention) and so we always
+ ignore it. */
+
+ n_args = 0;
+ for (i = 0; args[i]; i++)
+ n_args++;
+
+ if (6 < n_args + (passBBP ? 1 : 0))
+ vpanic("doHelperCall(AMD64): cannot currently handle > 6 args");
+
+ argregs[0] = hregAMD64_RDI();
+ argregs[1] = hregAMD64_RSI();
+ argregs[2] = hregAMD64_RDX();
+ argregs[3] = hregAMD64_RCX();
+ argregs[4] = hregAMD64_R8();
+ argregs[5] = hregAMD64_R9();
+
+ tmpregs[0] = tmpregs[1] = tmpregs[2] =
+ tmpregs[3] = tmpregs[4] = tmpregs[5] = INVALID_HREG;
+
+ fastinstrs[0] = fastinstrs[1] = fastinstrs[2] =
+ fastinstrs[3] = fastinstrs[4] = fastinstrs[5] = NULL;
+
+ /* First decide which scheme (slow or fast) is to be used. First
+ assume the fast scheme, and select slow if any contraindications
+ (wow) appear. */
+
+ if (guard) {
+ if (guard->tag == Iex_Const
+ && guard->Iex.Const.con->tag == Ico_U1
+ && guard->Iex.Const.con->Ico.U1 == True) {
+ /* unconditional */
+ } else {
+ /* Not manifestly unconditional -- be conservative. */
+ goto slowscheme;
+ }
+ }
+
+ /* Ok, let's try for the fast scheme. If it doesn't pan out, we'll
+ use the slow scheme. Because this is tentative, we can't call
+      addInstr (that is, commit to) any instructions until we've
+ handled all the arguments. So park the resulting instructions
+ in a buffer and emit that if we're successful. */
+
+ /* FAST SCHEME */
+ argreg = 0;
+ if (passBBP) {
+ fastinstrs[argreg] = mk_iMOVsd_RR( hregAMD64_RBP(), argregs[argreg]);
+ argreg++;
+ }
+
+ for (i = 0; i < n_args; i++) {
+ vassert(argreg < 6);
+ vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
+ fastinstrs[argreg]
+ = iselIntExpr_single_instruction( env, argregs[argreg], args[i] );
+ if (fastinstrs[argreg] == NULL)
+ goto slowscheme;
+ argreg++;
+ }
+
+ /* Looks like we're in luck. Emit the accumulated instructions and
+ move on to doing the call itself. */
+ vassert(argreg <= 6);
+ for (i = 0; i < argreg; i++)
+ addInstr(env, fastinstrs[i]);
+
+ /* Fast scheme only applies for unconditional calls. Hence: */
+ cc = Acc_ALWAYS;
+
+ goto handle_call;
+
+
+ /* SLOW SCHEME; move via temporaries */
+ slowscheme:
+#if 0
+   if (n_args > 0) {
+      for (i = 0; args[i]; i++) {
+         ppIRExpr(args[i]); vex_printf(" ");
+      }
+      vex_printf("\n");
+   }
+#endif
+ argreg = 0;
+
+ if (passBBP) {
+ /* This is pretty stupid; better to move directly to rdi
+ after the rest of the args are done. */
+ tmpregs[argreg] = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR( hregAMD64_RBP(), tmpregs[argreg]));
+ argreg++;
+ }
+
+ for (i = 0; i < n_args; i++) {
+ vassert(argreg < 6);
+ vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
+ tmpregs[argreg] = iselIntExpr_R(env, args[i]);
+ argreg++;
+ }
+
+ /* Now we can compute the condition. We can't do it earlier
+ because the argument computations could trash the condition
+ codes. Be a bit clever to handle the common case where the
+ guard is 1:Bit. */
+ cc = Acc_ALWAYS;
+ if (guard) {
+ if (guard->tag == Iex_Const
+ && guard->Iex.Const.con->tag == Ico_U1
+ && guard->Iex.Const.con->Ico.U1 == True) {
+ /* unconditional -- do nothing */
+ } else {
+ cc = iselCondCode( env, guard );
+ }
+ }
+
+ /* Move the args to their final destinations. */
+ for (i = 0; i < argreg; i++) {
+ /* None of these insns, including any spill code that might
+ be generated, may alter the condition codes. */
+ addInstr( env, mk_iMOVsd_RR( tmpregs[i], argregs[i] ) );
+ }
+
+
+ /* Finally, the call itself. */
+ handle_call:
+ addInstr(env, AMD64Instr_Call(
+ cc,
+ Ptr_to_ULong(cee->addr),
+ n_args + (passBBP ? 1 : 0)
+ )
+ );
+}
+
+
+/* Given a guest-state array descriptor, an index expression and a
+ bias, generate an AMD64AMode holding the relevant guest state
+ offset. */
+
+static
+AMD64AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
+ IRExpr* off, Int bias )
+{
+ HReg tmp, roff;
+ Int elemSz = sizeofIRType(descr->elemTy);
+ Int nElems = descr->nElems;
+
+ /* Throw out any cases not generated by an amd64 front end. In
+ theory there might be a day where we need to handle them -- if
+ we ever run non-amd64-guest on amd64 host. */
+
+ if (nElems != 8 || (elemSz != 1 && elemSz != 8))
+ vpanic("genGuestArrayOffset(amd64 host)");
+
+ /* Compute off into a reg, %off. Then return:
+
+ movq %off, %tmp
+ addq $bias, %tmp (if bias != 0)
+         andq $7, %tmp
+ ... base(%rbp, %tmp, shift) ...
+ */
+ tmp = newVRegI(env);
+ roff = iselIntExpr_R(env, off);
+ addInstr(env, mk_iMOVsd_RR(roff, tmp));
+ if (bias != 0) {
+ /* Make sure the bias is sane, in the sense that there are
+ no significant bits above bit 30 in it. */
+ vassert(-10000 < bias && bias < 10000);
+ addInstr(env,
+ AMD64Instr_Alu64R(Aalu_ADD, AMD64RMI_Imm(bias), tmp));
+ }
+ addInstr(env,
+ AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(7), tmp));
+ vassert(elemSz == 1 || elemSz == 8);
+ return
+ AMD64AMode_IRRS( descr->base, hregAMD64_RBP(), tmp,
+ elemSz==8 ? 3 : 0);
+}
+
+
+/* Set the SSE unit's rounding mode to default (%mxcsr = 0x1F80) */
+static
+void set_SSE_rounding_default ( ISelEnv* env )
+{
+ /* pushq $DEFAULT_MXCSR
+ ldmxcsr 0(%rsp)
+ addq $8, %rsp
+ */
+ AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(DEFAULT_MXCSR)));
+ addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
+ add_to_rsp(env, 8);
+}
+
+/* Mess with the FPU's rounding mode: set to the default rounding mode
+ (DEFAULT_FPUCW). */
+static
+void set_FPU_rounding_default ( ISelEnv* env )
+{
+ /* movq $DEFAULT_FPUCW, -8(%rsp)
+      fldcw -8(%rsp)
+ */
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
+ addInstr(env, AMD64Instr_Alu64M(
+ Aalu_MOV, AMD64RI_Imm(DEFAULT_FPUCW), m8_rsp));
+ addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
+}
+
+
+/* Mess with the SSE unit's rounding mode: 'mode' is an I32-typed
+ expression denoting a value in the range 0 .. 3, indicating a round
+ mode encoded as per type IRRoundingMode. Set the SSE machinery to
+ have the same rounding.
+*/
+static
+void set_SSE_rounding_mode ( ISelEnv* env, IRExpr* mode )
+{
+ /* Note: this sequence only makes sense because DEFAULT_MXCSR has
+ both rounding bits == 0. If that wasn't the case, we couldn't
+ create a new rounding field simply by ORing the new value into
+ place. */
+
+ /* movq $3, %reg
+ andq [[mode]], %reg -- shouldn't be needed; paranoia
+ shlq $13, %reg
+ orq $DEFAULT_MXCSR, %reg
+ pushq %reg
+      ldmxcsr 0(%rsp)
+ addq $8, %rsp
+ */
+ HReg reg = newVRegI(env);
+ AMD64AMode* zero_rsp = AMD64AMode_IR(0, hregAMD64_RSP());
+ addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Imm(3), reg));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ iselIntExpr_RMI(env, mode), reg));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 13, reg));
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_OR, AMD64RMI_Imm(DEFAULT_MXCSR), reg));
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(reg)));
+ addInstr(env, AMD64Instr_LdMXCSR(zero_rsp));
+ add_to_rsp(env, 8);
+}
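+
+/* Worked example (illustrative): for mode == 3 (Irrm_ZERO), the
+   value pushed is (3 << 13) | DEFAULT_MXCSR == 0x7F80, i.e.
+   %mxcsr.RC = 11b (truncate) with all exceptions still masked. */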
+
+
+/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
+ expression denoting a value in the range 0 .. 3, indicating a round
+ mode encoded as per type IRRoundingMode. Set the x87 FPU to have
+ the same rounding.
+*/
+static
+void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
+{
+ HReg rrm = iselIntExpr_R(env, mode);
+ HReg rrm2 = newVRegI(env);
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
+
+ /* movq %rrm, %rrm2
+ andq $3, %rrm2 -- shouldn't be needed; paranoia
+ shlq $10, %rrm2
+ orq $DEFAULT_FPUCW, %rrm2
+ movq %rrm2, -8(%rsp)
+      fldcw -8(%rsp)
+ */
+ addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(3), rrm2));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 10, rrm2));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
+ AMD64RMI_Imm(DEFAULT_FPUCW), rrm2));
+ addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,
+ AMD64RI_Reg(rrm2), m8_rsp));
+ addInstr(env, AMD64Instr_A87LdCW(m8_rsp));
+}
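+
+/* Worked example (illustrative): for mode == 3 (Irrm_ZERO), %rrm2
+   becomes (3 << 10) | DEFAULT_FPUCW == 0x0E7F, i.e. the x87 RC
+   field (bits 10..11) = 11b (truncate), with precision and
+   exception masks unchanged. */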
+
+
+/* Generate all-zeroes into a new vector register.
+*/
+static HReg generate_zeroes_V128 ( ISelEnv* env )
+{
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_SseReRg(Asse_XOR, dst, dst));
+ return dst;
+}
+
+/* Generate all-ones into a new vector register.
+*/
+static HReg generate_ones_V128 ( ISelEnv* env )
+{
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, dst, dst));
+ return dst;
+}
+
+
+/* Generate !src into a new vector register. Amazing that there isn't
+ a less crappy way to do this.
+*/
+static HReg do_sse_NotV128 ( ISelEnv* env, HReg src )
+{
+ HReg dst = generate_ones_V128(env);
+ addInstr(env, AMD64Instr_SseReRg(Asse_XOR, src, dst));
+ return dst;
+}
+
+
+/* Expand the given byte into a 64-bit word, by cloning each bit
+ 8 times. */
+static ULong bitmask8_to_bytemask64 ( UShort w8 )
+{
+ vassert(w8 == (w8 & 0xFF));
+ ULong w64 = 0;
+ Int i;
+ for (i = 0; i < 8; i++) {
+ if (w8 & (1<<i))
+ w64 |= (0xFFULL << (8 * i));
+ }
+ return w64;
+}
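+
+/* For example: bitmask8_to_bytemask64(0x81) == 0xFF000000000000FFULL
+   -- bits 0 and 7 of the input select bytes 0 and 7 of the result. */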
+
+
+//.. /* Round an x87 FPU value to 53-bit-mantissa precision, to be used
+//.. after most non-simple FPU operations (simple = +, -, *, / and
+//.. sqrt).
+//..
+//.. This could be done a lot more efficiently if needed, by loading
+//.. zero and adding it to the value to be rounded (fldz ; faddp?).
+//.. */
+//.. static void roundToF64 ( ISelEnv* env, HReg reg )
+//.. {
+//.. X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+//.. sub_from_esp(env, 8);
+//.. addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
+//.. addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
+//.. add_to_esp(env, 8);
+//.. }
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Select insns for an integer-typed expression, and add them to the
+ code list. Return a reg holding the result. This reg will be a
+ virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
+ want to modify it, ask for a new vreg, copy it in there, and modify
+ the copy. The register allocator will do its best to map both
+ vregs to the same real register, so the copies will often disappear
+ later in the game.
+
+ This should handle expressions of 64, 32, 16 and 8-bit type. All
+ results are returned in a 64-bit register. For 32-, 16- and 8-bit
+   expressions, the upper 32/48/56 bits are arbitrary, so you should
+ mask or sign extend partial values if necessary.
+*/
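+
+/* Illustrative pattern (assumed usage): to negate a value without
+   modifying the returned vreg, copy it first:
+
+      HReg r   = iselIntExpr_R(env, e);
+      HReg dst = newVRegI(env);
+      addInstr(env, mk_iMOVsd_RR(r, dst));
+      addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
+
+   This is exactly the copy-then-modify discipline the Sub64(0,x)
+   case below follows. */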
+
+static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselIntExpr_R_wrk(env, e);
+ /* sanity checks ... */
+# if 0
+ vex_printf("\niselIntExpr_R: "); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcInt64);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
+{
+ /* Used for unary/binary SIMD64 ops. */
+ HWord fn = 0;
+ Bool second_is_UInt;
+
+ MatchInfo mi;
+ DECLARE_PATTERN(p_1Uto8_64to1);
+ DECLARE_PATTERN(p_LDle8_then_8Uto64);
+ DECLARE_PATTERN(p_LDle16_then_16Uto64);
+
+ IRType ty = typeOfIRExpr(env->type_env,e);
+   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
+
+ switch (e->tag) {
+
+ /* --------- TEMP --------- */
+ case Iex_RdTmp: {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ /* --------- LOAD --------- */
+ case Iex_Load: {
+ HReg dst = newVRegI(env);
+ AMD64AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
+
+ /* We can't handle big-endian loads, nor load-linked. */
+ if (e->Iex.Load.end != Iend_LE)
+ goto irreducible;
+
+ if (ty == Ity_I64) {
+ addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
+ AMD64RMI_Mem(amode), dst) );
+ return dst;
+ }
+ if (ty == Ity_I32) {
+ addInstr(env, AMD64Instr_LoadEX(4,False,amode,dst));
+ return dst;
+ }
+ if (ty == Ity_I16) {
+ addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
+ return dst;
+ }
+ if (ty == Ity_I8) {
+ addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
+ return dst;
+ }
+ break;
+ }
+
+ /* --------- BINARY OP --------- */
+ case Iex_Binop: {
+ AMD64AluOp aluOp;
+ AMD64ShiftOp shOp;
+
+ /* Pattern: Sub64(0,x) */
+ /* and: Sub32(0,x) */
+ if ((e->Iex.Binop.op == Iop_Sub64 && isZeroU64(e->Iex.Binop.arg1))
+ || (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1))) {
+ HReg dst = newVRegI(env);
+ HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(reg,dst));
+ addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
+ return dst;
+ }
+
+ /* Is it an addition or logical style op? */
+ switch (e->Iex.Binop.op) {
+ case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
+ aluOp = Aalu_ADD; break;
+ case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
+ aluOp = Aalu_SUB; break;
+ case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
+ aluOp = Aalu_AND; break;
+ case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64:
+ aluOp = Aalu_OR; break;
+ case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
+ aluOp = Aalu_XOR; break;
+ case Iop_Mul16: case Iop_Mul32: case Iop_Mul64:
+ aluOp = Aalu_MUL; break;
+ default:
+ aluOp = Aalu_INVALID; break;
+ }
+ /* For commutative ops we assume any literal
+ values are on the second operand. */
+ if (aluOp != Aalu_INVALID) {
+ HReg dst = newVRegI(env);
+ HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(reg,dst));
+ addInstr(env, AMD64Instr_Alu64R(aluOp, rmi, dst));
+ return dst;
+ }
+
+ /* Perhaps a shift op? */
+ switch (e->Iex.Binop.op) {
+ case Iop_Shl64: case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
+ shOp = Ash_SHL; break;
+ case Iop_Shr64: case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
+ shOp = Ash_SHR; break;
+ case Iop_Sar64: case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
+ shOp = Ash_SAR; break;
+ default:
+ shOp = Ash_INVALID; break;
+ }
+ if (shOp != Ash_INVALID) {
+ HReg dst = newVRegI(env);
+
+ /* regL = the value to be shifted */
+ HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ addInstr(env, mk_iMOVsd_RR(regL,dst));
+
+ /* Do any necessary widening for 32/16/8 bit operands */
+ switch (e->Iex.Binop.op) {
+ case Iop_Shr64: case Iop_Shl64: case Iop_Sar64:
+ break;
+ case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
+ break;
+ case Iop_Shr8:
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_AND, AMD64RMI_Imm(0xFF), dst));
+ break;
+ case Iop_Shr16:
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_AND, AMD64RMI_Imm(0xFFFF), dst));
+ break;
+ case Iop_Shr32:
+ addInstr(env, AMD64Instr_MovxLQ(False, dst, dst));
+ break;
+ case Iop_Sar8:
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 56, dst));
+ addInstr(env, AMD64Instr_Sh64(Ash_SAR, 56, dst));
+ break;
+ case Iop_Sar16:
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 48, dst));
+ addInstr(env, AMD64Instr_Sh64(Ash_SAR, 48, dst));
+ break;
+ case Iop_Sar32:
+ addInstr(env, AMD64Instr_MovxLQ(True, dst, dst));
+ break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vassert(0);
+ }
+
+ /* Now consider the shift amount. If it's a literal, we
+ can do a much better job than the general case. */
+ if (e->Iex.Binop.arg2->tag == Iex_Const) {
+ /* assert that the IR is well-typed */
+ Int nshift;
+ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
+ nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ vassert(nshift >= 0);
+ if (nshift > 0)
+ /* Can't allow nshift==0 since that means %cl */
+ addInstr(env, AMD64Instr_Sh64(shOp, nshift, dst));
+ } else {
+ /* General case; we have to force the amount into %cl. */
+ HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(regR,hregAMD64_RCX()));
+ addInstr(env, AMD64Instr_Sh64(shOp, 0/* %cl */, dst));
+ }
+ return dst;
+ }
+
+ /* Deal with 64-bit SIMD binary ops */
+ second_is_UInt = False;
+ switch (e->Iex.Binop.op) {
+ case Iop_Add8x8:
+ fn = (HWord)h_generic_calc_Add8x8; break;
+ case Iop_Add16x4:
+ fn = (HWord)h_generic_calc_Add16x4; break;
+ case Iop_Add32x2:
+ fn = (HWord)h_generic_calc_Add32x2; break;
+
+ case Iop_Avg8Ux8:
+ fn = (HWord)h_generic_calc_Avg8Ux8; break;
+ case Iop_Avg16Ux4:
+ fn = (HWord)h_generic_calc_Avg16Ux4; break;
+
+ case Iop_CmpEQ8x8:
+ fn = (HWord)h_generic_calc_CmpEQ8x8; break;
+ case Iop_CmpEQ16x4:
+ fn = (HWord)h_generic_calc_CmpEQ16x4; break;
+ case Iop_CmpEQ32x2:
+ fn = (HWord)h_generic_calc_CmpEQ32x2; break;
+
+ case Iop_CmpGT8Sx8:
+ fn = (HWord)h_generic_calc_CmpGT8Sx8; break;
+ case Iop_CmpGT16Sx4:
+ fn = (HWord)h_generic_calc_CmpGT16Sx4; break;
+ case Iop_CmpGT32Sx2:
+ fn = (HWord)h_generic_calc_CmpGT32Sx2; break;
+
+ case Iop_InterleaveHI8x8:
+ fn = (HWord)h_generic_calc_InterleaveHI8x8; break;
+ case Iop_InterleaveLO8x8:
+ fn = (HWord)h_generic_calc_InterleaveLO8x8; break;
+ case Iop_InterleaveHI16x4:
+ fn = (HWord)h_generic_calc_InterleaveHI16x4; break;
+ case Iop_InterleaveLO16x4:
+ fn = (HWord)h_generic_calc_InterleaveLO16x4; break;
+ case Iop_InterleaveHI32x2:
+ fn = (HWord)h_generic_calc_InterleaveHI32x2; break;
+ case Iop_InterleaveLO32x2:
+ fn = (HWord)h_generic_calc_InterleaveLO32x2; break;
+ case Iop_CatOddLanes16x4:
+ fn = (HWord)h_generic_calc_CatOddLanes16x4; break;
+ case Iop_CatEvenLanes16x4:
+ fn = (HWord)h_generic_calc_CatEvenLanes16x4; break;
+ case Iop_Perm8x8:
+ fn = (HWord)h_generic_calc_Perm8x8; break;
+
+ case Iop_Max8Ux8:
+ fn = (HWord)h_generic_calc_Max8Ux8; break;
+ case Iop_Max16Sx4:
+ fn = (HWord)h_generic_calc_Max16Sx4; break;
+ case Iop_Min8Ux8:
+ fn = (HWord)h_generic_calc_Min8Ux8; break;
+ case Iop_Min16Sx4:
+ fn = (HWord)h_generic_calc_Min16Sx4; break;
+
+ case Iop_Mul16x4:
+ fn = (HWord)h_generic_calc_Mul16x4; break;
+ case Iop_Mul32x2:
+ fn = (HWord)h_generic_calc_Mul32x2; break;
+ case Iop_MulHi16Sx4:
+ fn = (HWord)h_generic_calc_MulHi16Sx4; break;
+ case Iop_MulHi16Ux4:
+ fn = (HWord)h_generic_calc_MulHi16Ux4; break;
+
+ case Iop_QAdd8Sx8:
+ fn = (HWord)h_generic_calc_QAdd8Sx8; break;
+ case Iop_QAdd16Sx4:
+ fn = (HWord)h_generic_calc_QAdd16Sx4; break;
+ case Iop_QAdd8Ux8:
+ fn = (HWord)h_generic_calc_QAdd8Ux8; break;
+ case Iop_QAdd16Ux4:
+ fn = (HWord)h_generic_calc_QAdd16Ux4; break;
+
+ case Iop_QNarrow32Sx2:
+ fn = (HWord)h_generic_calc_QNarrow32Sx2; break;
+ case Iop_QNarrow16Sx4:
+ fn = (HWord)h_generic_calc_QNarrow16Sx4; break;
+ case Iop_QNarrow16Ux4:
+ fn = (HWord)h_generic_calc_QNarrow16Ux4; break;
+
+ case Iop_QSub8Sx8:
+ fn = (HWord)h_generic_calc_QSub8Sx8; break;
+ case Iop_QSub16Sx4:
+ fn = (HWord)h_generic_calc_QSub16Sx4; break;
+ case Iop_QSub8Ux8:
+ fn = (HWord)h_generic_calc_QSub8Ux8; break;
+ case Iop_QSub16Ux4:
+ fn = (HWord)h_generic_calc_QSub16Ux4; break;
+
+ case Iop_Sub8x8:
+ fn = (HWord)h_generic_calc_Sub8x8; break;
+ case Iop_Sub16x4:
+ fn = (HWord)h_generic_calc_Sub16x4; break;
+ case Iop_Sub32x2:
+ fn = (HWord)h_generic_calc_Sub32x2; break;
+
+ case Iop_ShlN32x2:
+ fn = (HWord)h_generic_calc_ShlN32x2;
+ second_is_UInt = True;
+ break;
+ case Iop_ShlN16x4:
+ fn = (HWord)h_generic_calc_ShlN16x4;
+ second_is_UInt = True;
+ break;
+ case Iop_ShlN8x8:
+ fn = (HWord)h_generic_calc_ShlN8x8;
+ second_is_UInt = True;
+ break;
+ case Iop_ShrN32x2:
+ fn = (HWord)h_generic_calc_ShrN32x2;
+ second_is_UInt = True;
+ break;
+ case Iop_ShrN16x4:
+ fn = (HWord)h_generic_calc_ShrN16x4;
+ second_is_UInt = True;
+ break;
+ case Iop_SarN32x2:
+ fn = (HWord)h_generic_calc_SarN32x2;
+ second_is_UInt = True;
+ break;
+ case Iop_SarN16x4:
+ fn = (HWord)h_generic_calc_SarN16x4;
+ second_is_UInt = True;
+ break;
+ case Iop_SarN8x8:
+ fn = (HWord)h_generic_calc_SarN8x8;
+ second_is_UInt = True;
+ break;
+
+ default:
+ fn = (HWord)0; break;
+ }
+ if (fn != (HWord)0) {
+      /* Note: the following assumes all helpers are of signature
+            ULong fn ( ULong, ULong ),
+         and are not marked as regparm functions. */
+ HReg dst = newVRegI(env);
+ HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ if (second_is_UInt)
+ addInstr(env, AMD64Instr_MovxLQ(False, argR, argR));
+ addInstr(env, mk_iMOVsd_RR(argL, hregAMD64_RDI()) );
+ addInstr(env, mk_iMOVsd_RR(argR, hregAMD64_RSI()) );
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 2 ));
+ addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
+ return dst;
+ }
+
+ /* Handle misc other ops. */
+
+ if (e->Iex.Binop.op == Iop_Max32U) {
+ /* This generates a truly rotten piece of code. Just as well
+ it doesn't happen very often. */
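+      /* Sketch: shift both operands left by 32 so that an unsigned
+         64-bit compare agrees with the unsigned 32-bit compare of
+         the originals, then CMOVB arg2 into dst when arg1 < arg2:
+            dst   = src1
+            src1L = src1 << 32 ; src2L = src2 << 32
+            cmpq  src2L, src1L
+            cmovb src2, dst
+      */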
+ HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg src1L = newVRegI(env);
+ HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg src2L = newVRegI(env);
+ HReg dst = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(src1,dst));
+ addInstr(env, mk_iMOVsd_RR(src1,src1L));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, src1L));
+ addInstr(env, mk_iMOVsd_RR(src2,src2L));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, src2L));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_CMP, AMD64RMI_Reg(src2L), src1L));
+ addInstr(env, AMD64Instr_CMov64(Acc_B, AMD64RM_Reg(src2), dst));
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_DivModS64to32
+ || e->Iex.Binop.op == Iop_DivModU64to32) {
+ /* 64 x 32 -> (32(rem),32(div)) division */
+ /* Get the 64-bit operand into edx:eax, and the other into
+ any old R/M. */
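+      /* Sketch, for DivModU64to32(a,b) (the signed variant uses
+         idivl instead):
+            rdx = a ; rax = a ; rdx >>= 32    -- edx:eax := a
+            divl b                            -- eax = a/b, edx = a%b
+            zero-extend edx and eax ; rdx <<= 32
+            dst = rdx | rax                   -- (rem << 32) | div
+      */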
+ HReg rax = hregAMD64_RAX();
+ HReg rdx = hregAMD64_RDX();
+ HReg dst = newVRegI(env);
+ Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
+ AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
+ /* Compute the left operand into a reg, and then
+ put the top half in edx and the bottom in eax. */
+ HReg left64 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ addInstr(env, mk_iMOVsd_RR(left64, rdx));
+ addInstr(env, mk_iMOVsd_RR(left64, rax));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHR, 32, rdx));
+ addInstr(env, AMD64Instr_Div(syned, 4, rmRight));
+ addInstr(env, AMD64Instr_MovxLQ(False, rdx, rdx));
+ addInstr(env, AMD64Instr_MovxLQ(False, rax, rax));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, rdx));
+ addInstr(env, mk_iMOVsd_RR(rax, dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(rdx), dst));
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_32HLto64) {
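+      /* 32HLto64(hi,lo) = (hi << 32) | zero-extend-32(lo). */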
+ HReg hi32 = newVRegI(env);
+ HReg lo32 = newVRegI(env);
+ HReg hi32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg lo32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(hi32s, hi32));
+ addInstr(env, mk_iMOVsd_RR(lo32s, lo32));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, hi32));
+ addInstr(env, AMD64Instr_MovxLQ(False, lo32, lo32));
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_OR, AMD64RMI_Reg(lo32), hi32));
+ return hi32;
+ }
+
+ if (e->Iex.Binop.op == Iop_16HLto32) {
+ HReg hi16 = newVRegI(env);
+ HReg lo16 = newVRegI(env);
+ HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
+ addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 16, hi16));
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_AND, AMD64RMI_Imm(0xFFFF), lo16));
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_OR, AMD64RMI_Reg(lo16), hi16));
+ return hi16;
+ }
+
+ if (e->Iex.Binop.op == Iop_8HLto16) {
+ HReg hi8 = newVRegI(env);
+ HReg lo8 = newVRegI(env);
+ HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
+ addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 8, hi8));
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_AND, AMD64RMI_Imm(0xFF), lo8));
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_OR, AMD64RMI_Reg(lo8), hi8));
+ return hi8;
+ }
+
+ if (e->Iex.Binop.op == Iop_MullS32
+ || e->Iex.Binop.op == Iop_MullS16
+ || e->Iex.Binop.op == Iop_MullS8
+ || e->Iex.Binop.op == Iop_MullU32
+ || e->Iex.Binop.op == Iop_MullU16
+ || e->Iex.Binop.op == Iop_MullU8) {
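+      /* Widening multiplies: sign- or zero-extend both N-bit
+         operands to 64 bits (SHL by 64-N, then SAR or SHR back),
+         so that a single 64-bit IMUL computes the exact NxN -> 2N
+         product in the low bits of the result. */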
+ HReg a32 = newVRegI(env);
+ HReg b32 = newVRegI(env);
+ HReg a32s = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg b32s = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ Int shift = 0;
+ AMD64ShiftOp shr_op = Ash_SHR;
+ switch (e->Iex.Binop.op) {
+ case Iop_MullS32: shr_op = Ash_SAR; shift = 32; break;
+ case Iop_MullS16: shr_op = Ash_SAR; shift = 48; break;
+ case Iop_MullS8: shr_op = Ash_SAR; shift = 56; break;
+ case Iop_MullU32: shr_op = Ash_SHR; shift = 32; break;
+ case Iop_MullU16: shr_op = Ash_SHR; shift = 48; break;
+ case Iop_MullU8: shr_op = Ash_SHR; shift = 56; break;
+ default: vassert(0);
+ }
+
+ addInstr(env, mk_iMOVsd_RR(a32s, a32));
+ addInstr(env, mk_iMOVsd_RR(b32s, b32));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, a32));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, shift, b32));
+ addInstr(env, AMD64Instr_Sh64(shr_op, shift, a32));
+ addInstr(env, AMD64Instr_Sh64(shr_op, shift, b32));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_MUL, AMD64RMI_Reg(a32), b32));
+ return b32;
+ }
+
+ if (e->Iex.Binop.op == Iop_CmpF64) {
+ HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
+ HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegI(env);
+ addInstr(env, AMD64Instr_SseUComIS(8,fL,fR,dst));
+      /* Mask out irrelevant parts of the result so as to conform
+         to the CmpF64 definition: 0x45 keeps only the images of
+         ZF, PF and CF. */
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND, AMD64RMI_Imm(0x45), dst));
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_F64toI32S
+ || e->Iex.Binop.op == Iop_F64toI64S) {
+ Int szD = e->Iex.Binop.op==Iop_F64toI32S ? 4 : 8;
+ HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegI(env);
+ set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
+ addInstr(env, AMD64Instr_SseSF2SI( 8, szD, rf, dst ));
+ set_SSE_rounding_default(env);
+ return dst;
+ }
+
+//.. if (e->Iex.Binop.op == Iop_F64toI32 || e->Iex.Binop.op == Iop_F64toI16) {
+//.. Int sz = e->Iex.Binop.op == Iop_F64toI16 ? 2 : 4;
+//.. HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
+//.. HReg dst = newVRegI(env);
+//..
+//.. /* Used several times ... */
+//.. X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+//..
+//.. /* rf now holds the value to be converted, and rrm holds the
+//.. rounding mode value, encoded as per the IRRoundingMode
+//.. enum. The first thing to do is set the FPU's rounding
+//.. mode accordingly. */
+//..
+//.. /* Create a space for the format conversion. */
+//.. /* subl $4, %esp */
+//.. sub_from_esp(env, 4);
+//..
+//.. /* Set host rounding mode */
+//.. set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+//..
+//.. /* gistw/l %rf, 0(%esp) */
+//.. addInstr(env, X86Instr_FpLdStI(False/*store*/, sz, rf, zero_esp));
+//..
+//.. if (sz == 2) {
+//.. /* movzwl 0(%esp), %dst */
+//.. addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
+//.. } else {
+//.. /* movl 0(%esp), %dst */
+//.. vassert(sz == 4);
+//.. addInstr(env, X86Instr_Alu32R(
+//.. Xalu_MOV, X86RMI_Mem(zero_esp), dst));
+//.. }
+//..
+//.. /* Restore default FPU rounding. */
+//.. set_FPU_rounding_default( env );
+//..
+//.. /* addl $4, %esp */
+//.. add_to_esp(env, 4);
+//.. return dst;
+//.. }
+//..
+//.. /* C3210 flags following FPU partial remainder (fprem), both
+//.. IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
+//.. if (e->Iex.Binop.op == Iop_PRemC3210F64
+//.. || e->Iex.Binop.op == Iop_PRem1C3210F64) {
+//.. HReg junk = newVRegF(env);
+//.. HReg dst = newVRegI(env);
+//.. HReg srcL = iselDblExpr(env, e->Iex.Binop.arg1);
+//.. HReg srcR = iselDblExpr(env, e->Iex.Binop.arg2);
+//.. addInstr(env, X86Instr_FpBinary(
+//.. e->Iex.Binop.op==Iop_PRemC3210F64
+//.. ? Xfp_PREM : Xfp_PREM1,
+//.. srcL,srcR,junk
+//.. ));
+//.. /* The previous pseudo-insn will have left the FPU's C3210
+//.. flags set correctly. So bag them. */
+//.. addInstr(env, X86Instr_FpStSW_AX());
+//.. addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
+//.. addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
+//.. return dst;
+//.. }
+
+ break;
+ }
+
+ /* --------- UNARY OP --------- */
+ case Iex_Unop: {
+
+ /* 1Uto8(64to1(expr64)) */
+ {
+ DEFINE_PATTERN( p_1Uto8_64to1,
+ unop(Iop_1Uto8, unop(Iop_64to1, bind(0))) );
+ if (matchIRExpr(&mi,p_1Uto8_64to1,e)) {
+ IRExpr* expr64 = mi.bindee[0];
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, expr64);
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ AMD64RMI_Imm(1), dst));
+ return dst;
+ }
+ }
+
+ /* 8Uto64(LDle(expr64)) */
+ {
+ DEFINE_PATTERN(p_LDle8_then_8Uto64,
+ unop(Iop_8Uto64,
+ IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
+ if (matchIRExpr(&mi,p_LDle8_then_8Uto64,e)) {
+ HReg dst = newVRegI(env);
+ AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+ addInstr(env, AMD64Instr_LoadEX(1,False,amode,dst));
+ return dst;
+ }
+ }
+
+ /* 16Uto64(LDle(expr64)) */
+ {
+ DEFINE_PATTERN(p_LDle16_then_16Uto64,
+ unop(Iop_16Uto64,
+ IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
+ if (matchIRExpr(&mi,p_LDle16_then_16Uto64,e)) {
+ HReg dst = newVRegI(env);
+ AMD64AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+ addInstr(env, AMD64Instr_LoadEX(2,False,amode,dst));
+ return dst;
+ }
+ }
+
+ switch (e->Iex.Unop.op) {
+ case Iop_32Uto64:
+ case Iop_32Sto64: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_MovxLQ(e->Iex.Unop.op == Iop_32Sto64,
+ src, dst) );
+ return dst;
+ }
+ case Iop_128HIto64: {
+ HReg rHi, rLo;
+ iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rHi; /* and abandon rLo */
+ }
+ case Iop_128to64: {
+ HReg rHi, rLo;
+ iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rLo; /* and abandon rHi */
+ }
+ case Iop_8Uto16:
+ case Iop_8Uto32:
+ case Iop_8Uto64:
+ case Iop_16Uto64:
+ case Iop_16Uto32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Uto32
+ || e->Iex.Unop.op==Iop_16Uto64 );
+ UInt mask = srcIs16 ? 0xFFFF : 0xFF;
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ AMD64RMI_Imm(mask), dst));
+ return dst;
+ }
+ case Iop_8Sto16:
+ case Iop_8Sto64:
+ case Iop_8Sto32:
+ case Iop_16Sto32:
+ case Iop_16Sto64: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ Bool srcIs16 = toBool( e->Iex.Unop.op==Iop_16Sto32
+ || e->Iex.Unop.op==Iop_16Sto64 );
+ UInt amt = srcIs16 ? 48 : 56;
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, amt, dst));
+ addInstr(env, AMD64Instr_Sh64(Ash_SAR, amt, dst));
+ return dst;
+ }
+ case Iop_Not8:
+ case Iop_Not16:
+ case Iop_Not32:
+ case Iop_Not64: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, AMD64Instr_Unary64(Aun_NOT,dst));
+ return dst;
+ }
+//.. case Iop_64HIto32: {
+//.. HReg rHi, rLo;
+//.. iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+//.. return rHi; /* and abandon rLo .. poor wee thing :-) */
+//.. }
+//.. case Iop_64to32: {
+//.. HReg rHi, rLo;
+//.. iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+//.. return rLo; /* similar stupid comment to the above ... */
+//.. }
+ case Iop_16HIto8:
+ case Iop_32HIto16:
+ case Iop_64HIto32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ Int shift = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_16HIto8: shift = 8; break;
+ case Iop_32HIto16: shift = 16; break;
+ case Iop_64HIto32: shift = 32; break;
+ default: vassert(0);
+ }
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, AMD64Instr_Sh64(Ash_SHR, shift, dst));
+ return dst;
+ }
+ case Iop_1Uto64:
+ case Iop_1Uto32:
+ case Iop_1Uto8: {
+ HReg dst = newVRegI(env);
+ AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_Set64(cond,dst));
+ return dst;
+ }
+ case Iop_1Sto8:
+ case Iop_1Sto16:
+ case Iop_1Sto32:
+ case Iop_1Sto64: {
+ /* could do better than this, but for now ... */
+ HReg dst = newVRegI(env);
+ AMD64CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_Set64(cond,dst));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 63, dst));
+ addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
+ return dst;
+ }
+ case Iop_Ctz64: {
+ /* Count trailing zeroes, implemented by amd64 'bsfq' */
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_Bsfr64(True,src,dst));
+ return dst;
+ }
+ case Iop_Clz64: {
+ /* Count leading zeroes. Do 'bsrq' to establish the index
+ of the highest set bit, and subtract that value from
+ 63. */
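+         /* For example, src == 1 gives bsrq == 0, hence clz == 63.
+            As with bsrq itself, the result for src == 0 is
+            undefined. */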
+ HReg tmp = newVRegI(env);
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_Bsfr64(False,src,tmp));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,
+ AMD64RMI_Imm(63), dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_SUB,
+ AMD64RMI_Reg(tmp), dst));
+ return dst;
+ }
+
+ case Iop_CmpwNEZ64: {
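+         /* Uses the identity: (src | -src) >>signed 63 is all ones
+            if src != 0, and all zeroes otherwise. */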
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src,dst));
+ addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
+ AMD64RMI_Reg(src), dst));
+ addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
+ return dst;
+ }
+
+ case Iop_CmpwNEZ32: {
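+         /* Same (x | -x) >>signed 63 trick as for CmpwNEZ64, after
+            first zero-extending the 32-bit value to 64 bits. */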
+ HReg src = newVRegI(env);
+ HReg dst = newVRegI(env);
+ HReg pre = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(pre,src));
+ addInstr(env, AMD64Instr_MovxLQ(False, src, src));
+ addInstr(env, mk_iMOVsd_RR(src,dst));
+ addInstr(env, AMD64Instr_Unary64(Aun_NEG,dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_OR,
+ AMD64RMI_Reg(src), dst));
+ addInstr(env, AMD64Instr_Sh64(Ash_SAR, 63, dst));
+ return dst;
+ }
+
+ case Iop_Left8:
+ case Iop_Left16:
+ case Iop_Left32:
+ case Iop_Left64: {
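+         /* Left(x) is defined as x | -x. */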
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src, dst));
+ addInstr(env, AMD64Instr_Unary64(Aun_NEG, dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_OR, AMD64RMI_Reg(src), dst));
+ return dst;
+ }
+
+ case Iop_V128to32: {
+ HReg dst = newVRegI(env);
+ HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+ AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp_m16));
+ addInstr(env, AMD64Instr_LoadEX(4, False/*z-widen*/, rsp_m16, dst));
+ return dst;
+ }
+
+ /* V128{HI}to64 */
+ case Iop_V128HIto64:
+ case Iop_V128to64: {
+ Int off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
+ HReg dst = newVRegI(env);
+ HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+ AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
+ AMD64AMode* rspN = AMD64AMode_IR(off, hregAMD64_RSP());
+ sub_from_rsp(env, 16);
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, rsp0));
+ addInstr(env, AMD64Instr_Alu64R( Aalu_MOV,
+ AMD64RMI_Mem(rspN), dst ));
+ add_to_rsp(env, 16);
+ return dst;
+ }
+
+ /* ReinterpF64asI64(e) */
+ /* Given an IEEE754 double, produce an I64 with the same bit
+ pattern. */
+ case Iop_ReinterpF64asI64: {
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
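+         /* The value is bounced through -8(%rsp) with no %rsp
+            adjustment, presumably relying on the AMD64 ABI's
+            128-byte red zone below the stack pointer. */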
+ HReg dst = newVRegI(env);
+ HReg src = iselDblExpr(env, e->Iex.Unop.arg);
+ /* paranoia */
+ set_SSE_rounding_default(env);
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, src, m8_rsp));
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_MOV, AMD64RMI_Mem(m8_rsp), dst));
+ return dst;
+ }
+
+ /* ReinterpF32asI32(e) */
+ /* Given an IEEE754 single, produce an I64 with the same bit
+ pattern in the lower half. */
+ case Iop_ReinterpF32asI32: {
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
+ HReg dst = newVRegI(env);
+ HReg src = iselFltExpr(env, e->Iex.Unop.arg);
+ /* paranoia */
+ set_SSE_rounding_default(env);
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, src, m8_rsp));
+ addInstr(env, AMD64Instr_LoadEX(4, False/*unsigned*/, m8_rsp, dst ));
+ return dst;
+ }
+
+ case Iop_16to8:
+ case Iop_32to8:
+ case Iop_64to8:
+ case Iop_32to16:
+ case Iop_64to16:
+ case Iop_64to32:
+         /* These are no-ops: the narrowing is done simply by
+            ignoring the unwanted upper bits of the register. */
+ return iselIntExpr_R(env, e->Iex.Unop.arg);
+
+ default:
+ break;
+ }
+
+ /* Deal with unary 64-bit SIMD ops. */
+ switch (e->Iex.Unop.op) {
+ case Iop_CmpNEZ32x2:
+ fn = (HWord)h_generic_calc_CmpNEZ32x2; break;
+ case Iop_CmpNEZ16x4:
+ fn = (HWord)h_generic_calc_CmpNEZ16x4; break;
+ case Iop_CmpNEZ8x8:
+ fn = (HWord)h_generic_calc_CmpNEZ8x8; break;
+ default:
+ fn = (HWord)0; break;
+ }
+ if (fn != (HWord)0) {
+         /* Note: the following assumes all helpers are of signature
+               ULong fn ( ULong ),
+            and are not marked as regparm functions. */
+ HReg dst = newVRegI(env);
+ HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(arg, hregAMD64_RDI()) );
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 1 ));
+ addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
+ return dst;
+ }
+
+ break;
+ }
+
+ /* --------- GET --------- */
+ case Iex_Get: {
+ if (ty == Ity_I64) {
+ HReg dst = newVRegI(env);
+ addInstr(env, AMD64Instr_Alu64R(
+ Aalu_MOV,
+ AMD64RMI_Mem(
+ AMD64AMode_IR(e->Iex.Get.offset,
+ hregAMD64_RBP())),
+ dst));
+ return dst;
+ }
+ if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
+ HReg dst = newVRegI(env);
+ addInstr(env, AMD64Instr_LoadEX(
+ toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
+ False,
+ AMD64AMode_IR(e->Iex.Get.offset,hregAMD64_RBP()),
+ dst));
+ return dst;
+ }
+ break;
+ }
+
+ case Iex_GetI: {
+ AMD64AMode* am
+ = genGuestArrayOffset(
+ env, e->Iex.GetI.descr,
+ e->Iex.GetI.ix, e->Iex.GetI.bias );
+ HReg dst = newVRegI(env);
+ if (ty == Ity_I8) {
+ addInstr(env, AMD64Instr_LoadEX( 1, False, am, dst ));
+ return dst;
+ }
+ if (ty == Ity_I64) {
+ addInstr(env, AMD64Instr_Alu64R( Aalu_MOV, AMD64RMI_Mem(am), dst ));
+ return dst;
+ }
+ break;
+ }
+
+ /* --------- CCALL --------- */
+ case Iex_CCall: {
+ HReg dst = newVRegI(env);
+ vassert(ty == e->Iex.CCall.retty);
+
+      /* Be very restrictive for now: only 64-bit ints allowed for
+         args, and 64 or 32 bits for the return type. */
+ if (e->Iex.CCall.retty != Ity_I64 && e->Iex.CCall.retty != Ity_I32)
+ goto irreducible;
+
+ /* Marshal args, do the call. */
+ doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
+
+ /* Move to dst, and zero out the top 32 bits if the result type is
+ Ity_I32. Probably overkill, but still .. */
+ if (e->Iex.CCall.retty == Ity_I64)
+ addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), dst));
+ else
+ addInstr(env, AMD64Instr_MovxLQ(False, hregAMD64_RAX(), dst));
+
+ return dst;
+ }
+
+ /* --------- LITERAL --------- */
+ /* 64/32/16/8-bit literals */
+ case Iex_Const:
+ if (ty == Ity_I64) {
+ HReg r = newVRegI(env);
+ addInstr(env, AMD64Instr_Imm64(e->Iex.Const.con->Ico.U64, r));
+ return r;
+ } else {
+ AMD64RMI* rmi = iselIntExpr_RMI ( env, e );
+ HReg r = newVRegI(env);
+ addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, rmi, r));
+ return r;
+ }
+
+ /* --------- MULTIPLEX --------- */
+ case Iex_Mux0X: {
+ if ((ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
+ && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
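+         /* Lowering: dst = exprX; test the low byte of cond; if it
+            is zero, conditionally move expr0 into dst. */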
+ HReg r8;
+ HReg rX = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
+ AMD64RM* r0 = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(rX,dst));
+ r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
+ addInstr(env, AMD64Instr_Test64(0xFF, r8));
+ addInstr(env, AMD64Instr_CMov64(Acc_Z,r0,dst));
+ return dst;
+ }
+ break;
+ }
+
+ /* --------- TERNARY OP --------- */
+ case Iex_Triop: {
+ /* C3210 flags following FPU partial remainder (fprem), both
+ IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
+ if (e->Iex.Triop.op == Iop_PRemC3210F64
+ || e->Iex.Triop.op == Iop_PRem1C3210F64) {
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
+ HReg arg1 = iselDblExpr(env, e->Iex.Triop.arg2);
+ HReg arg2 = iselDblExpr(env, e->Iex.Triop.arg3);
+ HReg dst = newVRegI(env);
+ addInstr(env, AMD64Instr_A87Free(2));
+
+ /* one arg -> top of x87 stack */
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg2, m8_rsp));
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
+
+ /* other arg -> top of x87 stack */
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg1, m8_rsp));
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
+
+ switch (e->Iex.Triop.op) {
+ case Iop_PRemC3210F64:
+ addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
+ break;
+ case Iop_PRem1C3210F64:
+ addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
+ break;
+ default:
+ vassert(0);
+ }
+ /* Ignore the result, and instead make off with the FPU's
+ C3210 flags (in the status word). */
+ addInstr(env, AMD64Instr_A87StSW(m8_rsp));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Mem(m8_rsp),dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0x4700),dst));
+ return dst;
+ }
+ break;
+ }
+
+ default:
+ break;
+ } /* switch (e->tag) */
+
+ /* We get here if no pattern matched. */
+ irreducible:
+ ppIRExpr(e);
+ vpanic("iselIntExpr_R(amd64): cannot reduce tree");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expression auxiliaries ---*/
+/*---------------------------------------------------------*/
+
+/* --------------------- AMODEs --------------------- */
+
+/* Return an AMode which computes the value of the specified
+ expression, possibly also adding insns to the code list as a
+   result. The expression may only be a 64-bit one.
+*/
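+/* amd64 amodes come in two forms: reg+disp (AMD64AMode_IR) and
+   disp+base+index*scale with scale in {1,2,4,8} (AMD64AMode_IRRS).
+   The pattern matches below try to fold Add64/Shl64 trees into
+   these forms. */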
+
+static AMD64AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
+{
+ AMD64AMode* am = iselIntExpr_AMode_wrk(env, e);
+ vassert(sane_AMode(am));
+ return am;
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static AMD64AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
+{
+ MatchInfo mi;
+ DECLARE_PATTERN(p_complex);
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I64);
+
+ /* Add64( Add64(expr1, Shl64(expr2, imm8)), simm32 ) */
+ /* bind0 bind1 bind2 bind3 */
+ DEFINE_PATTERN(p_complex,
+ binop( Iop_Add64,
+ binop( Iop_Add64,
+ bind(0),
+ binop(Iop_Shl64, bind(1), bind(2))
+ ),
+ bind(3)
+ )
+ );
+ if (matchIRExpr(&mi, p_complex, e)) {
+ IRExpr* expr1 = mi.bindee[0];
+ IRExpr* expr2 = mi.bindee[1];
+ IRExpr* imm8 = mi.bindee[2];
+ IRExpr* simm32 = mi.bindee[3];
+ if (imm8->tag == Iex_Const
+ && imm8->Iex.Const.con->tag == Ico_U8
+ && imm8->Iex.Const.con->Ico.U8 < 4
+ /* imm8 is OK, now check simm32 */
+ && simm32->tag == Iex_Const
+ && simm32->Iex.Const.con->tag == Ico_U64
+ && fitsIn32Bits(simm32->Iex.Const.con->Ico.U64)) {
+ UInt shift = imm8->Iex.Const.con->Ico.U8;
+ UInt offset = toUInt(simm32->Iex.Const.con->Ico.U64);
+ HReg r1 = iselIntExpr_R(env, expr1);
+ HReg r2 = iselIntExpr_R(env, expr2);
+ vassert(shift == 0 || shift == 1 || shift == 2 || shift == 3);
+ return AMD64AMode_IRRS(offset, r1, r2, shift);
+ }
+ }
+
+ /* Add64(expr1, Shl64(expr2, imm)) */
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_Add64
+ && e->Iex.Binop.arg2->tag == Iex_Binop
+ && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl64
+ && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
+ UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ if (shift == 1 || shift == 2 || shift == 3) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
+ return AMD64AMode_IRRS(0, r1, r2, shift);
+ }
+ }
+
+ /* Add64(expr,i) */
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_Add64
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
+ && fitsIn32Bits(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ return AMD64AMode_IR(
+ toUInt(e->Iex.Binop.arg2->Iex.Const.con->Ico.U64),
+ r1
+ );
+ }
+
+ /* Doesn't match anything in particular. Generate it into
+ a register and use that. */
+ {
+ HReg r1 = iselIntExpr_R(env, e);
+ return AMD64AMode_IR(0, r1);
+ }
+}
+
+
+/* --------------------- RMIs --------------------- */
+
+/* Similarly, calculate an expression into an AMD64RMI operand. As
+   with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
+   bits. */
+
+static AMD64RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
+{
+ AMD64RMI* rmi = iselIntExpr_RMI_wrk(env, e);
+ /* sanity checks ... */
+ switch (rmi->tag) {
+ case Armi_Imm:
+ return rmi;
+ case Armi_Reg:
+ vassert(hregClass(rmi->Armi.Reg.reg) == HRcInt64);
+ vassert(hregIsVirtual(rmi->Armi.Reg.reg));
+ return rmi;
+ case Armi_Mem:
+ vassert(sane_AMode(rmi->Armi.Mem.am));
+ return rmi;
+ default:
+ vpanic("iselIntExpr_RMI: unknown amd64 RMI tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static AMD64RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I64 || ty == Ity_I32
+ || ty == Ity_I16 || ty == Ity_I8);
+
+ /* special case: immediate 64/32/16/8 */
+ if (e->tag == Iex_Const) {
+ switch (e->Iex.Const.con->tag) {
+ case Ico_U64:
+ if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
+ return AMD64RMI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
+ }
+ break;
+ case Ico_U32:
+ return AMD64RMI_Imm(e->Iex.Const.con->Ico.U32); break;
+ case Ico_U16:
+ return AMD64RMI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16); break;
+ case Ico_U8:
+ return AMD64RMI_Imm(0xFF & e->Iex.Const.con->Ico.U8); break;
+ default:
+ vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
+ }
+ }
+
+ /* special case: 64-bit GET */
+ if (e->tag == Iex_Get && ty == Ity_I64) {
+ return AMD64RMI_Mem(AMD64AMode_IR(e->Iex.Get.offset,
+ hregAMD64_RBP()));
+ }
+
+ /* special case: 64-bit load from memory */
+ if (e->tag == Iex_Load && ty == Ity_I64
+ && e->Iex.Load.end == Iend_LE) {
+ AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
+ return AMD64RMI_Mem(am);
+ }
+
+ /* default case: calculate into a register and return that */
+ {
+ HReg r = iselIntExpr_R ( env, e );
+ return AMD64RMI_Reg(r);
+ }
+}
+
+
+/* --------------------- RIs --------------------- */
+
+/* Calculate an expression into an AMD64RI operand. As with
+ iselIntExpr_R, the expression can have type 64, 32, 16 or 8
+ bits. */
+
+static AMD64RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
+{
+ AMD64RI* ri = iselIntExpr_RI_wrk(env, e);
+ /* sanity checks ... */
+ switch (ri->tag) {
+ case Ari_Imm:
+ return ri;
+ case Ari_Reg:
+ vassert(hregClass(ri->Ari.Reg.reg) == HRcInt64);
+ vassert(hregIsVirtual(ri->Ari.Reg.reg));
+ return ri;
+ default:
+ vpanic("iselIntExpr_RI: unknown amd64 RI tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static AMD64RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I64 || ty == Ity_I32
+ || ty == Ity_I16 || ty == Ity_I8);
+
+ /* special case: immediate */
+ if (e->tag == Iex_Const) {
+ switch (e->Iex.Const.con->tag) {
+ case Ico_U64:
+ if (fitsIn32Bits(e->Iex.Const.con->Ico.U64)) {
+ return AMD64RI_Imm(toUInt(e->Iex.Const.con->Ico.U64));
+ }
+ break;
+ case Ico_U32:
+ return AMD64RI_Imm(e->Iex.Const.con->Ico.U32);
+ case Ico_U16:
+ return AMD64RI_Imm(0xFFFF & e->Iex.Const.con->Ico.U16);
+ case Ico_U8:
+ return AMD64RI_Imm(0xFF & e->Iex.Const.con->Ico.U8);
+ default:
+ vpanic("iselIntExpr_RMI.Iex_Const(amd64)");
+ }
+ }
+
+ /* default case: calculate into a register and return that */
+ {
+ HReg r = iselIntExpr_R ( env, e );
+ return AMD64RI_Reg(r);
+ }
+}
+
+
+/* --------------------- RMs --------------------- */
+
+/* Similarly, calculate an expression into an AMD64RM operand. As
+ with iselIntExpr_R, the expression can have type 64, 32, 16 or 8
+ bits. */
+
+static AMD64RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
+{
+ AMD64RM* rm = iselIntExpr_RM_wrk(env, e);
+ /* sanity checks ... */
+ switch (rm->tag) {
+ case Arm_Reg:
+ vassert(hregClass(rm->Arm.Reg.reg) == HRcInt64);
+ vassert(hregIsVirtual(rm->Arm.Reg.reg));
+ return rm;
+ case Arm_Mem:
+ vassert(sane_AMode(rm->Arm.Mem.am));
+ return rm;
+ default:
+ vpanic("iselIntExpr_RM: unknown amd64 RM tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static AMD64RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
+
+ /* special case: 64-bit GET */
+ if (e->tag == Iex_Get && ty == Ity_I64) {
+ return AMD64RM_Mem(AMD64AMode_IR(e->Iex.Get.offset,
+ hregAMD64_RBP()));
+ }
+
+   /* special case: load from memory -- not handled specially here;
+      such expressions fall through to the default case below. */
+
+ /* default case: calculate into a register and return that */
+ {
+ HReg r = iselIntExpr_R ( env, e );
+ return AMD64RM_Reg(r);
+ }
+}
+
+
+/* --------------------- CONDCODE --------------------- */
+
+/* Generate code to evaluate a bit-typed expression, returning the
+   condition code which corresponds to the expression notionally
+   having returned 1. */
+
+static AMD64CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
+{
+ /* Uh, there's nothing we can sanity check here, unfortunately. */
+ return iselCondCode_wrk(env,e);
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static AMD64CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
+{
+ MatchInfo mi;
+
+ vassert(e);
+ vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
+
+ /* var */
+ if (e->tag == Iex_RdTmp) {
+ HReg r64 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ HReg dst = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(r64,dst));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(1),dst));
+ return Acc_NZ;
+ }
+
+ /* Constant 1:Bit */
+ if (e->tag == Iex_Const) {
+ HReg r;
+ vassert(e->Iex.Const.con->tag == Ico_U1);
+ vassert(e->Iex.Const.con->Ico.U1 == True
+ || e->Iex.Const.con->Ico.U1 == False);
+ r = newVRegI(env);
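+      /* xor r,r sets ZF, so the Z condition now holds; hence return
+         Z for a True constant and NZ for False. */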
+ addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,AMD64RMI_Imm(0),r));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,AMD64RMI_Reg(r),r));
+ return e->Iex.Const.con->Ico.U1 ? Acc_Z : Acc_NZ;
+ }
+
+ /* Not1(...) */
+ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
+ /* Generate code for the arg, and negate the test condition */
+ return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
+ }
+
+ /* --- patterns rooted at: 64to1 --- */
+
+ /* 64to1 */
+ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_64to1) {
+ HReg reg = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_Test64(1,reg));
+ return Acc_NZ;
+ }
+
+ /* --- patterns rooted at: CmpNEZ8 --- */
+
+ /* CmpNEZ8(x) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ8) {
+ HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_Test64(0xFF,r));
+ return Acc_NZ;
+ }
+
+ /* --- patterns rooted at: CmpNEZ16 --- */
+
+ /* CmpNEZ16(x) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ16) {
+ HReg r = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_Test64(0xFFFF,r));
+ return Acc_NZ;
+ }
+
+ /* --- patterns rooted at: CmpNEZ32 --- */
+
+ /* CmpNEZ32(x) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ32) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
+ HReg tmp = newVRegI(env);
+ AMD64RMI* rmi2 = AMD64RMI_Imm(0);
+ addInstr(env, AMD64Instr_MovxLQ(False, r1, tmp));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,tmp));
+ return Acc_NZ;
+ }
+
+ /* --- patterns rooted at: CmpNEZ64 --- */
+
+ /* CmpNEZ64(Or64(x,y)) */
+ {
+ DECLARE_PATTERN(p_CmpNEZ64_Or64);
+ DEFINE_PATTERN(p_CmpNEZ64_Or64,
+ unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
+ if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
+ HReg r0 = iselIntExpr_R(env, mi.bindee[0]);
+ AMD64RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
+ HReg tmp = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(r0, tmp));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_OR,rmi1,tmp));
+ return Acc_NZ;
+ }
+ }
+
+ /* CmpNEZ64(x) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ64) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
+ AMD64RMI* rmi2 = AMD64RMI_Imm(0);
+ addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
+ return Acc_NZ;
+ }
+
+ /* --- patterns rooted at: Cmp{EQ,NE}{8,16,32} --- */
+
+ /* CmpEQ8 / CmpNE8 */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ8
+ || e->Iex.Binop.op == Iop_CmpNE8
+ || e->Iex.Binop.op == Iop_CasCmpEQ8
+ || e->Iex.Binop.op == Iop_CasCmpNE8)) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ HReg r = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(r1,r));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFF),r));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Acc_Z;
+ case Iop_CmpNE8: case Iop_CasCmpNE8: return Acc_NZ;
+ default: vpanic("iselCondCode(amd64): CmpXX8");
+ }
+ }
+
+ /* CmpEQ16 / CmpNE16 */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ16
+ || e->Iex.Binop.op == Iop_CmpNE16
+ || e->Iex.Binop.op == Iop_CasCmpEQ16
+ || e->Iex.Binop.op == Iop_CasCmpNE16)) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ HReg r = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(r1,r));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,AMD64RMI_Imm(0xFFFF),r));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Acc_Z;
+ case Iop_CmpNE16: case Iop_CasCmpNE16: return Acc_NZ;
+ default: vpanic("iselCondCode(amd64): CmpXX16");
+ }
+ }
+
+ /* CmpEQ32 / CmpNE32 */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ32
+ || e->Iex.Binop.op == Iop_CmpNE32
+ || e->Iex.Binop.op == Iop_CasCmpEQ32
+ || e->Iex.Binop.op == Iop_CasCmpNE32)) {
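+      /* Compute arg1 ^ arg2, then shift left by 32 so that only the
+         low 32 bits of the XOR determine the Z flag. */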
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ HReg r = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(r1,r));
+ addInstr(env, AMD64Instr_Alu64R(Aalu_XOR,rmi2,r));
+ addInstr(env, AMD64Instr_Sh64(Ash_SHL, 32, r));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Acc_Z;
+ case Iop_CmpNE32: case Iop_CasCmpNE32: return Acc_NZ;
+ default: vpanic("iselCondCode(amd64): CmpXX32");
+ }
+ }
+
+ /* Cmp*64*(x,y) */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ64
+ || e->Iex.Binop.op == Iop_CmpNE64
+ || e->Iex.Binop.op == Iop_CmpLT64S
+ || e->Iex.Binop.op == Iop_CmpLT64U
+ || e->Iex.Binop.op == Iop_CmpLE64S
+ || e->Iex.Binop.op == Iop_CmpLE64U
+ || e->Iex.Binop.op == Iop_CasCmpEQ64
+ || e->Iex.Binop.op == Iop_CasCmpNE64)) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ AMD64RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ addInstr(env, AMD64Instr_Alu64R(Aalu_CMP,rmi2,r1));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ64: case Iop_CasCmpEQ64: return Acc_Z;
+ case Iop_CmpNE64: case Iop_CasCmpNE64: return Acc_NZ;
+ case Iop_CmpLT64S: return Acc_L;
+ case Iop_CmpLT64U: return Acc_B;
+ case Iop_CmpLE64S: return Acc_LE;
+ case Iop_CmpLE64U: return Acc_BE;
+ default: vpanic("iselCondCode(amd64): CmpXX64");
+ }
+ }
+
+ ppIRExpr(e);
+ vpanic("iselCondCode(amd64)");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expressions (128 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Compute a 128-bit value into a register pair, which is returned as
+ the first two parameters. As with iselIntExpr_R, these may be
+ either real or virtual regs; in any case they must not be changed
+ by subsequent code emitted by the caller. */
+
+static void iselInt128Expr ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e )
+{
+ iselInt128Expr_wrk(rHi, rLo, env, e);
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(*rHi) == HRcInt64);
+ vassert(hregIsVirtual(*rHi));
+ vassert(hregClass(*rLo) == HRcInt64);
+ vassert(hregIsVirtual(*rLo));
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e )
+{
+//.. HWord fn = 0; /* helper fn for most SIMD64 stuff */
+ vassert(e);
+ vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
+
+//.. /* 64-bit literal */
+//.. if (e->tag == Iex_Const) {
+//.. ULong w64 = e->Iex.Const.con->Ico.U64;
+//.. UInt wHi = ((UInt)(w64 >> 32)) & 0xFFFFFFFF;
+//.. UInt wLo = ((UInt)w64) & 0xFFFFFFFF;
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. vassert(e->Iex.Const.con->tag == Ico_U64);
+//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
+//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+
+ /* read 128-bit IRTemp */
+ if (e->tag == Iex_RdTmp) {
+ lookupIRTemp128( rHi, rLo, env, e->Iex.RdTmp.tmp);
+ return;
+ }
+
+//.. /* 64-bit load */
+//.. if (e->tag == Iex_LDle) {
+//.. HReg tLo, tHi;
+//.. X86AMode *am0, *am4;
+//.. vassert(e->Iex.LDle.ty == Ity_I64);
+//.. tLo = newVRegI(env);
+//.. tHi = newVRegI(env);
+//.. am0 = iselIntExpr_AMode(env, e->Iex.LDle.addr);
+//.. am4 = advance4(am0);
+//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
+//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. /* 64-bit GET */
+//.. if (e->tag == Iex_Get) {
+//.. X86AMode* am = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
+//.. X86AMode* am4 = advance4(am);
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
+//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. /* 64-bit GETI */
+//.. if (e->tag == Iex_GetI) {
+//.. X86AMode* am
+//.. = genGuestArrayOffset( env, e->Iex.GetI.descr,
+//.. e->Iex.GetI.ix, e->Iex.GetI.bias );
+//.. X86AMode* am4 = advance4(am);
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
+//.. addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. /* 64-bit Mux0X */
+//.. if (e->tag == Iex_Mux0X) {
+//.. HReg e0Lo, e0Hi, eXLo, eXHi, r8;
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
+//.. iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
+//.. addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
+//.. addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
+//.. r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
+//.. addInstr(env, X86Instr_Test32(X86RI_Imm(0xFF), X86RM_Reg(r8)));
+//.. /* This assumes the first cmov32 doesn't trash the condition
+//.. codes, so they are still available for the second cmov32 */
+//.. addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
+//.. addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+
+ /* --------- BINARY ops --------- */
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+ /* 64 x 64 -> 128 multiply */
+ case Iop_MullU64:
+ case Iop_MullS64: {
+ /* get one operand into %rax, and the other into a R/M.
+            Need to make an educated guess about which operand is
+            better placed in which. */
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
+ AMD64RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
+ HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(rRight, hregAMD64_RAX()));
+ addInstr(env, AMD64Instr_MulL(syned, rmLeft));
+ /* Result is now in RDX:RAX. Tell the caller. */
+ addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
+ addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 128 x 64 -> (64(rem),64(div)) division */
+ case Iop_DivModU128to64:
+ case Iop_DivModS128to64: {
+ /* Get the 128-bit operand into rdx:rax, and the other into
+ any old R/M. */
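+         /* The div leaves the quotient in %rax and the remainder in
+            %rdx; these are handed back as (rHi = remainder,
+            rLo = quotient), matching the (64(rem),64(div)) result
+            convention. */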
+ HReg sHi, sLo;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS128to64);
+ AMD64RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
+ iselInt128Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
+ addInstr(env, mk_iMOVsd_RR(sHi, hregAMD64_RDX()));
+ addInstr(env, mk_iMOVsd_RR(sLo, hregAMD64_RAX()));
+ addInstr(env, AMD64Instr_Div(syned, 8, rmRight));
+ addInstr(env, mk_iMOVsd_RR(hregAMD64_RDX(), tHi));
+ addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64HLto128(e1,e2) */
+ case Iop_64HLto128:
+ *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ return;
+
+//.. /* Or64/And64/Xor64 */
+//.. case Iop_Or64:
+//.. case Iop_And64:
+//.. case Iop_Xor64: {
+//.. HReg xLo, xHi, yLo, yHi;
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
+//.. : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
+//.. : Xalu_XOR;
+//.. iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+//.. addInstr(env, mk_iMOVsd_RR(xHi, tHi));
+//.. addInstr(env, mk_iMOVsd_RR(xLo, tLo));
+//.. iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
+//.. addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
+//.. addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. /* Add64/Sub64 */
+//.. case Iop_Add64:
+//.. case Iop_Sub64: {
+//.. HReg xLo, xHi, yLo, yHi;
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+//.. addInstr(env, mk_iMOVsd_RR(xHi, tHi));
+//.. addInstr(env, mk_iMOVsd_RR(xLo, tLo));
+//.. iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
+//.. if (e->Iex.Binop.op==Iop_Add64) {
+//.. addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
+//.. addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
+//.. } else {
+//.. addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
+//.. addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
+//.. }
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. /* 32HLto64(e1,e2) */
+//.. case Iop_32HLto64:
+//.. *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
+//.. *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
+//.. return;
+//..
+//.. /* 64-bit shifts */
+//.. case Iop_Shl64: {
+//.. /* We use the same ingenious scheme as gcc. Put the value
+//.. to be shifted into %hi:%lo, and the shift amount into
+//.. %cl. Then (dsts on right, a la ATT syntax):
+//..
+//.. shldl %cl, %lo, %hi -- make %hi be right for the
+//.. -- shift amt %cl % 32
+//.. shll %cl, %lo -- make %lo be right for the
+//.. -- shift amt %cl % 32
+//..
+//.. Now, if (shift amount % 64) is in the range 32 .. 63,
+//.. we have to do a fixup, which puts the result low half
+//.. into the result high half, and zeroes the low half:
+//..
+//.. testl $32, %ecx
+//..
+//.. cmovnz %lo, %hi
+//.. movl $0, %tmp -- sigh; need yet another reg
+//.. cmovnz %tmp, %lo
+//.. */
+//.. HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
+//.. tLo = newVRegI(env);
+//.. tHi = newVRegI(env);
+//.. tTemp = newVRegI(env);
+//.. rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
+//.. iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
+//.. addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
+//.. addInstr(env, mk_iMOVsd_RR(sHi, tHi));
+//.. addInstr(env, mk_iMOVsd_RR(sLo, tLo));
+//.. /* Ok. Now shift amt is in %ecx, and value is in tHi/tLo
+//.. and those regs are legitimately modifiable. */
+//.. addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
+//.. addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, X86RM_Reg(tLo)));
+//.. addInstr(env, X86Instr_Test32(X86RI_Imm(32),
+//.. X86RM_Reg(hregX86_ECX())));
+//.. addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
+//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
+//.. addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. case Iop_Shr64: {
+//.. /* We use the same ingenious scheme as gcc. Put the value
+//.. to be shifted into %hi:%lo, and the shift amount into
+//.. %cl. Then:
+//..
+//.. shrdl %cl, %hi, %lo -- make %lo be right for the
+//.. -- shift amt %cl % 32
+//.. shrl %cl, %hi -- make %hi be right for the
+//.. -- shift amt %cl % 32
+//..
+//.. Now, if (shift amount % 64) is in the range 32 .. 63,
+//.. we have to do a fixup, which puts the result high half
+//.. into the result low half, and zeroes the high half:
+//..
+//.. testl $32, %ecx
+//..
+//.. cmovnz %hi, %lo
+//.. movl $0, %tmp -- sigh; need yet another reg
+//.. cmovnz %tmp, %hi
+//.. */
+//.. HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
+//.. tLo = newVRegI(env);
+//.. tHi = newVRegI(env);
+//.. tTemp = newVRegI(env);
+//.. rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
+//.. iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
+//.. addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
+//.. addInstr(env, mk_iMOVsd_RR(sHi, tHi));
+//.. addInstr(env, mk_iMOVsd_RR(sLo, tLo));
+//.. /* Ok. Now shift amt is in %ecx, and value is in tHi/tLo
+//.. and those regs are legitimately modifiable. */
+//.. addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
+//.. addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, X86RM_Reg(tHi)));
+//.. addInstr(env, X86Instr_Test32(X86RI_Imm(32),
+//.. X86RM_Reg(hregX86_ECX())));
+//.. addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
+//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
+//.. addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. /* F64 -> I64 */
+//.. /* Sigh, this is an almost exact copy of the F64 -> I32/I16
+//.. case. Unfortunately I see no easy way to avoid the
+//.. duplication. */
+//.. case Iop_F64toI64: {
+//.. HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//..
+//.. /* Used several times ... */
+//.. /* Careful ... this sharing is only safe because
+//.. zero_esp/four_esp do not hold any registers which the
+//.. register allocator could attempt to swizzle later. */
+//.. X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+//.. X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
+//..
+//.. /* rf now holds the value to be converted, and rrm holds
+//.. the rounding mode value, encoded as per the
+//.. IRRoundingMode enum. The first thing to do is set the
+//.. FPU's rounding mode accordingly. */
+//..
+//.. /* Create a space for the format conversion. */
+//.. /* subl $8, %esp */
+//.. sub_from_esp(env, 8);
+//..
+//.. /* Set host rounding mode */
+//.. set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+//..
+//.. /* gistll %rf, 0(%esp) */
+//.. addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
+//..
+//.. /* movl 0(%esp), %dstLo */
+//.. /* movl 4(%esp), %dstHi */
+//.. addInstr(env, X86Instr_Alu32R(
+//.. Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
+//.. addInstr(env, X86Instr_Alu32R(
+//.. Xalu_MOV, X86RMI_Mem(four_esp), tHi));
+//..
+//.. /* Restore default FPU rounding. */
+//.. set_FPU_rounding_default( env );
+//..
+//.. /* addl $8, %esp */
+//.. add_to_esp(env, 8);
+//..
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+ default:
+ break;
+ }
+ } /* if (e->tag == Iex_Binop) */
+
+
+//.. /* --------- UNARY ops --------- */
+//.. if (e->tag == Iex_Unop) {
+//.. switch (e->Iex.Unop.op) {
+//..
+//.. /* 32Sto64(e) */
+//.. case Iop_32Sto64: {
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+//.. addInstr(env, mk_iMOVsd_RR(src,tHi));
+//.. addInstr(env, mk_iMOVsd_RR(src,tLo));
+//.. addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, X86RM_Reg(tHi)));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. /* 32Uto64(e) */
+//.. case Iop_32Uto64: {
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+//.. addInstr(env, mk_iMOVsd_RR(src,tLo));
+//.. addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+
+//.. /* could do better than this, but for now ... */
+//.. case Iop_1Sto64: {
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+//.. addInstr(env, X86Instr_Set32(cond,tLo));
+//.. addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, X86RM_Reg(tLo)));
+//.. addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, X86RM_Reg(tLo)));
+//.. addInstr(env, mk_iMOVsd_RR(tLo, tHi));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. /* Not64(e) */
+//.. case Iop_Not64: {
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//.. HReg sHi, sLo;
+//.. iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
+//.. addInstr(env, mk_iMOVsd_RR(sHi, tHi));
+//.. addInstr(env, mk_iMOVsd_RR(sLo, tLo));
+//.. addInstr(env, X86Instr_Unary32(Xun_NOT,X86RM_Reg(tHi)));
+//.. addInstr(env, X86Instr_Unary32(Xun_NOT,X86RM_Reg(tLo)));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+//..
+//.. default:
+//.. break;
+//.. }
+//.. } /* if (e->tag == Iex_Unop) */
+//..
+//..
+//.. /* --------- CCALL --------- */
+//.. if (e->tag == Iex_CCall) {
+//.. HReg tLo = newVRegI(env);
+//.. HReg tHi = newVRegI(env);
+//..
+//.. /* Marshal args, do the call, clear stack. */
+//.. doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
+//..
+//.. addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
+//.. addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
+//.. *rHi = tHi;
+//.. *rLo = tLo;
+//.. return;
+//.. }
+
+ ppIRExpr(e);
+ vpanic("iselInt128Expr");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Floating point expressions (32 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Nothing interesting here; really just wrappers for
+ 64-bit stuff. */
+
+static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselFltExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcVec128);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_F32);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ AMD64AMode* am;
+ HReg res = newVRegV(env);
+ vassert(e->Iex.Load.ty == Ity_F32);
+ am = iselIntExpr_AMode(env, e->Iex.Load.addr);
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, res, am));
+ return res;
+ }
+
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_F64toF32) {
+ /* Although the result is still held in a standard SSE register,
+ we need to round it to reflect the loss of accuracy/range
+ entailed in casting it to a 32-bit float. */
+ HReg dst = newVRegV(env);
+ HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
+ set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
+ addInstr(env, AMD64Instr_SseSDSS(True/*D->S*/,src,dst));
+ set_SSE_rounding_default( env );
+ return dst;
+ }
+
+ if (e->tag == Iex_Get) {
+ AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
+ hregAMD64_RBP() );
+ HReg res = newVRegV(env);
+ addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, res, am ));
+ return res;
+ }
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
+ /* Given an I32, produce an IEEE754 float with the same bit
+ pattern. */
+ HReg dst = newVRegV(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ AMD64AMode* m4_rsp = AMD64AMode_IR(-4, hregAMD64_RSP());
+ addInstr(env, AMD64Instr_Store(4, src, m4_rsp));
+ addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 4, dst, m4_rsp ));
+ return dst;
+ }
+
+ if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
+ HReg arg = iselFltExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+
+      /* arg now holds the value to be rounded. The first thing to do
+ is set the FPU's rounding mode accordingly. */
+
+ /* Set host x87 rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, arg, m8_rsp));
+ addInstr(env, AMD64Instr_A87Free(1));
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 4));
+ addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 4));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 4, dst, m8_rsp));
+
+ /* Restore default x87 rounding. */
+ set_FPU_rounding_default( env );
+
+ return dst;
+ }
+
+ ppIRExpr(e);
+ vpanic("iselFltExpr_wrk");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Floating point expressions (64 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Compute a 64-bit floating point value into the lower half of an xmm
+ register, the identity of which is returned. As with
+ iselIntExpr_R, the returned reg will be virtual, and it must not be
+ changed by subsequent code emitted by the caller.
+*/
+
+/* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
+
+ Type S (1 bit) E (11 bits) F (52 bits)
+ ---- --------- ----------- -----------
+ signalling NaN u 2047 (max) .0uuuuu---u
+ (with at least
+ one 1 bit)
+ quiet NaN u 2047 (max) .1uuuuu---u
+
+ negative infinity 1 2047 (max) .000000---0
+
+ positive infinity 0 2047 (max) .000000---0
+
+ negative zero 1 0 .000000---0
+
+ positive zero 0 0 .000000---0
+*/
+
+static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselDblExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcVec128);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(e);
+ vassert(ty == Ity_F64);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Const) {
+ union { ULong u64; Double f64; } u;
+ HReg res = newVRegV(env);
+ HReg tmp = newVRegI(env);
+ vassert(sizeof(u) == 8);
+ vassert(sizeof(u.u64) == 8);
+ vassert(sizeof(u.f64) == 8);
+
+ if (e->Iex.Const.con->tag == Ico_F64) {
+ u.f64 = e->Iex.Const.con->Ico.F64;
+ }
+ else if (e->Iex.Const.con->tag == Ico_F64i) {
+ u.u64 = e->Iex.Const.con->Ico.F64i;
+ }
+ else
+ vpanic("iselDblExpr(amd64): const");
+
+ addInstr(env, AMD64Instr_Imm64(u.u64, tmp));
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(tmp)));
+ addInstr(env, AMD64Instr_SseLdSt(
+ True/*load*/, 8, res,
+ AMD64AMode_IR(0, hregAMD64_RSP())
+ ));
+ add_to_rsp(env, 8);
+ return res;
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ AMD64AMode* am;
+ HReg res = newVRegV(env);
+ vassert(e->Iex.Load.ty == Ity_F64);
+ am = iselIntExpr_AMode(env, e->Iex.Load.addr);
+ addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
+ return res;
+ }
+
+ if (e->tag == Iex_Get) {
+ AMD64AMode* am = AMD64AMode_IR( e->Iex.Get.offset,
+ hregAMD64_RBP() );
+ HReg res = newVRegV(env);
+ addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
+ return res;
+ }
+
+ if (e->tag == Iex_GetI) {
+ AMD64AMode* am
+ = genGuestArrayOffset(
+ env, e->Iex.GetI.descr,
+ e->Iex.GetI.ix, e->Iex.GetI.bias );
+ HReg res = newVRegV(env);
+ addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 8, res, am ));
+ return res;
+ }
+
+ if (e->tag == Iex_Triop) {
+ AMD64SseOp op = Asse_INVALID;
+ switch (e->Iex.Triop.op) {
+ case Iop_AddF64: op = Asse_ADDF; break;
+ case Iop_SubF64: op = Asse_SUBF; break;
+ case Iop_MulF64: op = Asse_MULF; break;
+ case Iop_DivF64: op = Asse_DIVF; break;
+ default: break;
+ }
+ if (op != Asse_INVALID) {
+ HReg dst = newVRegV(env);
+ HReg argL = iselDblExpr(env, e->Iex.Triop.arg2);
+ HReg argR = iselDblExpr(env, e->Iex.Triop.arg3);
+ addInstr(env, mk_vMOVsd_RR(argL, dst));
+ /* XXXROUNDINGFIXME */
+ /* set roundingmode here */
+ addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
+ return dst;
+ }
+ }
+
+ if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
+ HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+
+      /* arg now holds the value to be rounded.  The first thing to do
+         is set the FPU's rounding mode accordingly. */
+
+ /* Set host x87 rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
+ addInstr(env, AMD64Instr_A87Free(1));
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
+ addInstr(env, AMD64Instr_A87FpOp(Afp_ROUND));
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
+
+ /* Restore default x87 rounding. */
+ set_FPU_rounding_default( env );
+
+ return dst;
+ }
+
+ if (e->tag == Iex_Triop
+ && (e->Iex.Triop.op == Iop_ScaleF64
+ || e->Iex.Triop.op == Iop_AtanF64
+ || e->Iex.Triop.op == Iop_Yl2xF64
+ || e->Iex.Triop.op == Iop_Yl2xp1F64
+ || e->Iex.Triop.op == Iop_PRemF64
+ || e->Iex.Triop.op == Iop_PRem1F64)
+ ) {
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
+ HReg arg1 = iselDblExpr(env, e->Iex.Triop.arg2);
+ HReg arg2 = iselDblExpr(env, e->Iex.Triop.arg3);
+ HReg dst = newVRegV(env);
+ Bool arg2first = toBool(e->Iex.Triop.op == Iop_ScaleF64
+ || e->Iex.Triop.op == Iop_PRemF64
+ || e->Iex.Triop.op == Iop_PRem1F64);
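+      /* fscale/fprem/fprem1 take their principal operand in %st(0) and
+         the scale factor / divisor in %st(1), so for those ops arg2 is
+         pushed first, leaving arg1 on top of the x87 stack. */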
+ addInstr(env, AMD64Instr_A87Free(2));
+
+ /* one arg -> top of x87 stack */
+ addInstr(env, AMD64Instr_SseLdSt(
+ False/*store*/, 8, arg2first ? arg2 : arg1, m8_rsp));
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
+
+ /* other arg -> top of x87 stack */
+ addInstr(env, AMD64Instr_SseLdSt(
+ False/*store*/, 8, arg2first ? arg1 : arg2, m8_rsp));
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
+
+ /* do it */
+ /* XXXROUNDINGFIXME */
+ /* set roundingmode here */
+ switch (e->Iex.Triop.op) {
+ case Iop_ScaleF64:
+ addInstr(env, AMD64Instr_A87FpOp(Afp_SCALE));
+ break;
+ case Iop_AtanF64:
+ addInstr(env, AMD64Instr_A87FpOp(Afp_ATAN));
+ break;
+ case Iop_Yl2xF64:
+ addInstr(env, AMD64Instr_A87FpOp(Afp_YL2X));
+ break;
+ case Iop_Yl2xp1F64:
+ addInstr(env, AMD64Instr_A87FpOp(Afp_YL2XP1));
+ break;
+ case Iop_PRemF64:
+ addInstr(env, AMD64Instr_A87FpOp(Afp_PREM));
+ break;
+ case Iop_PRem1F64:
+ addInstr(env, AMD64Instr_A87FpOp(Afp_PREM1));
+ break;
+ default:
+ vassert(0);
+ }
+
+ /* save result */
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
+ return dst;
+ }
+
+ if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
+ HReg dst = newVRegV(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ set_SSE_rounding_mode( env, e->Iex.Binop.arg1 );
+ addInstr(env, AMD64Instr_SseSI2SF( 8, 8, src, dst ));
+ set_SSE_rounding_default( env );
+ return dst;
+ }
+
+ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_I32StoF64) {
+ HReg dst = newVRegV(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ set_SSE_rounding_default( env );
+ addInstr(env, AMD64Instr_SseSI2SF( 4, 8, src, dst ));
+ return dst;
+ }
+
+ if (e->tag == Iex_Unop
+ && (e->Iex.Unop.op == Iop_NegF64
+ || e->Iex.Unop.op == Iop_AbsF64)) {
+ /* Sigh ... very rough code. Could do much better. */
+      /* Get the 128-bit literal 00---0 10---0 (only bit 63 set) into a
+         register, then xor it with the value (to negate) or andn it
+         (to take the absolute value). */
+ HReg r1 = newVRegI(env);
+ HReg dst = newVRegV(env);
+ HReg tmp = newVRegV(env);
+ HReg src = iselDblExpr(env, e->Iex.Unop.arg);
+ AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
+ addInstr(env, mk_vMOVsd_RR(src,tmp));
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
+ addInstr(env, AMD64Instr_Imm64( 1ULL<<63, r1 ));
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Reg(r1)));
+ addInstr(env, AMD64Instr_SseLdSt(True, 16, dst, rsp0));
+
+ if (e->Iex.Unop.op == Iop_NegF64)
+ addInstr(env, AMD64Instr_SseReRg(Asse_XOR, tmp, dst));
+ else
+ addInstr(env, AMD64Instr_SseReRg(Asse_ANDN, tmp, dst));
+
+ add_to_rsp(env, 16);
+ return dst;
+ }
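+   /* Illustration only: a scalar model of the XOR/ANDN trick above,
+      under the assumption (true for amd64) that the IEEE754 sign is
+      bit 63.  Kept under "# if 0"; a sketch, not isel code. */
+# if 0
+   #include <stdint.h>
+   #include <string.h>
+   static double model_NegAbsF64 ( double d, int isAbs )
+   {
+      uint64_t bits, mask = 1ULL << 63;
+      memcpy(&bits, &d, 8);          /* reinterpret, no conversion */
+      bits = isAbs ? (bits & ~mask)  /* ANDN with mask: clear sign (abs) */
+                   : (bits ^ mask);  /* XOR with mask: flip sign (neg)   */
+      memcpy(&d, &bits, 8);
+      return d;
+   }
+# endif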
+
+ if (e->tag == Iex_Binop) {
+ A87FpOp fpop = Afp_INVALID;
+ switch (e->Iex.Binop.op) {
+ case Iop_SqrtF64: fpop = Afp_SQRT; break;
+ case Iop_SinF64: fpop = Afp_SIN; break;
+ case Iop_CosF64: fpop = Afp_COS; break;
+ case Iop_TanF64: fpop = Afp_TAN; break;
+ case Iop_2xm1F64: fpop = Afp_2XM1; break;
+ default: break;
+ }
+ if (fpop != Afp_INVALID) {
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
+ HReg arg = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ Int nNeeded = e->Iex.Binop.op==Iop_TanF64 ? 2 : 1;
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, arg, m8_rsp));
+ addInstr(env, AMD64Instr_A87Free(nNeeded));
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, True/*push*/, 8));
+ /* XXXROUNDINGFIXME */
+ /* set roundingmode here */
+ addInstr(env, AMD64Instr_A87FpOp(fpop));
+ if (e->Iex.Binop.op==Iop_TanF64) {
+ /* get rid of the extra 1.0 that fptan pushes */
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
+ }
+ addInstr(env, AMD64Instr_A87PushPop(m8_rsp, False/*pop*/, 8));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
+ return dst;
+ }
+ }
+
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+//.. case Iop_I32toF64: {
+//.. HReg dst = newVRegF(env);
+//.. HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
+//.. addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
+//.. set_FPU_rounding_default(env);
+//.. addInstr(env, X86Instr_FpLdStI(
+//.. True/*load*/, 4, dst,
+//.. X86AMode_IR(0, hregX86_ESP())));
+//.. add_to_esp(env, 4);
+//.. return dst;
+//.. }
+ case Iop_ReinterpI64asF64: {
+ /* Given an I64, produce an IEEE754 double with the same
+ bit pattern. */
+ AMD64AMode* m8_rsp = AMD64AMode_IR(-8, hregAMD64_RSP());
+ HReg dst = newVRegV(env);
+ AMD64RI* src = iselIntExpr_RI(env, e->Iex.Unop.arg);
+ /* paranoia */
+ set_SSE_rounding_default(env);
+ addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, src, m8_rsp));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 8, dst, m8_rsp));
+ return dst;
+ }
+ case Iop_F32toF64: {
+ HReg f32;
+ HReg f64 = newVRegV(env);
+ /* this shouldn't be necessary, but be paranoid ... */
+ set_SSE_rounding_default(env);
+ f32 = iselFltExpr(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_SseSDSS(False/*S->D*/, f32, f64));
+ return f64;
+ }
+ default:
+ break;
+ }
+ }
+
+ /* --------- MULTIPLEX --------- */
+ if (e->tag == Iex_Mux0X) {
+ HReg r8, rX, r0, dst;
+ vassert(ty == Ity_F64);
+ vassert(typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8);
+ r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
+ rX = iselDblExpr(env, e->Iex.Mux0X.exprX);
+ r0 = iselDblExpr(env, e->Iex.Mux0X.expr0);
+ dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(rX,dst));
+ addInstr(env, AMD64Instr_Test64(0xFF, r8));
+ addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
+ return dst;
+ }
+
+ ppIRExpr(e);
+ vpanic("iselDblExpr_wrk");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
+/*---------------------------------------------------------*/
+
+static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselVecExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcVec128);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ HWord fn = 0; /* address of helper fn, if required */
+ Bool arg1isEReg = False;
+ AMD64SseOp op = Asse_INVALID;
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(e);
+ vassert(ty == Ity_V128);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Get) {
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_SseLdSt(
+ True/*load*/,
+ 16,
+ dst,
+ AMD64AMode_IR(e->Iex.Get.offset, hregAMD64_RBP())
+ )
+ );
+ return dst;
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ HReg dst = newVRegV(env);
+ AMD64AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
+ addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, am ));
+ return dst;
+ }
+
+ if (e->tag == Iex_Const) {
+ HReg dst = newVRegV(env);
+ vassert(e->Iex.Const.con->tag == Ico_V128);
+ switch (e->Iex.Const.con->Ico.V128) {
+ case 0x0000:
+ dst = generate_zeroes_V128(env);
+ break;
+ case 0xFFFF:
+ dst = generate_ones_V128(env);
+ break;
+ default: {
+ AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
+ /* do push_uimm64 twice, first time for the high-order half. */
+ push_uimm64(env, bitmask8_to_bytemask64(
+ (e->Iex.Const.con->Ico.V128 >> 8) & 0xFF
+ ));
+ push_uimm64(env, bitmask8_to_bytemask64(
+ (e->Iex.Const.con->Ico.V128 >> 0) & 0xFF
+ ));
+ addInstr(env, AMD64Instr_SseLdSt( True/*load*/, 16, dst, rsp0 ));
+ add_to_rsp(env, 16);
+ break;
+ }
+ }
+ return dst;
+ }
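+   /* Illustration only: the expansion bitmask8_to_bytemask64 is
+      expected to perform -- Ico_V128 carries one bit per vector byte,
+      so each of the 8 bits becomes a 0x00 or 0xFF byte.  A sketch,
+      assuming exactly that convention: */
+# if 0
+   #include <stdint.h>
+   static uint64_t model_bitmask8_to_bytemask64 ( uint8_t w8 )
+   {
+      uint64_t m = 0;
+      int i;
+      for (i = 0; i < 8; i++)
+         if (w8 & (1 << i))
+            m |= 0xFFULL << (8 * i);
+      return m;
+   }
+# endif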
+
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+
+ case Iop_NotV128: {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ return do_sse_NotV128(env, arg);
+ }
+
+ case Iop_CmpNEZ64x2: {
+ /* We can use SSE2 instructions for this. */
+ /* Ideally, we want to do a 64Ix2 comparison against zero of
+ the operand. Problem is no such insn exists. Solution
+ therefore is to do a 32Ix4 comparison instead, and bitwise-
+ negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
+ let the not'd result of this initial comparison be a:b:c:d.
+ What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
+ pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
+ giving the required result.
+
+ The required selection sequence is 2,3,0,1, which
+ according to Intel's documentation means the pshufd
+ literal value is 0xB1, that is,
+ (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
+ */
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg tmp = generate_zeroes_V128(env);
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_SseReRg(Asse_CMPEQ32, arg, tmp));
+ tmp = do_sse_NotV128(env, tmp);
+ addInstr(env, AMD64Instr_SseShuf(0xB1, tmp, dst));
+ addInstr(env, AMD64Instr_SseReRg(Asse_OR, tmp, dst));
+ return dst;
+ }
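+      /* Illustration only: a scalar model of the sequence above, run
+         on the four 32-bit lanes v[0..3] of the argument.  A sketch,
+         not isel code. */
+# if 0
+      #include <stdint.h>
+      static void model_CmpNEZ64x2 ( uint32_t v[4] )
+      {
+         uint32_t m[4];
+         int i;
+         for (i = 0; i < 4; i++)
+            m[i] = (v[i] == 0) ? 0u : ~0u;   /* not(cmpeq32 vs zero) */
+         /* pshufd 0xB1 produces m[1]:m[0]:m[3]:m[2]; OR-ing that with
+            m gives the per-64-bit-lane nonzero mask. */
+         v[0] = m[0] | m[1];  v[1] = m[1] | m[0];
+         v[2] = m[2] | m[3];  v[3] = m[3] | m[2];
+      }
+# endif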
+
+ case Iop_CmpNEZ32x4: op = Asse_CMPEQ32; goto do_CmpNEZ_vector;
+ case Iop_CmpNEZ16x8: op = Asse_CMPEQ16; goto do_CmpNEZ_vector;
+ case Iop_CmpNEZ8x16: op = Asse_CMPEQ8; goto do_CmpNEZ_vector;
+ do_CmpNEZ_vector:
+ {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg tmp = newVRegV(env);
+ HReg zero = generate_zeroes_V128(env);
+ HReg dst;
+ addInstr(env, mk_vMOVsd_RR(arg, tmp));
+ addInstr(env, AMD64Instr_SseReRg(op, zero, tmp));
+ dst = do_sse_NotV128(env, tmp);
+ return dst;
+ }
+
+ case Iop_Recip32Fx4: op = Asse_RCPF; goto do_32Fx4_unary;
+ case Iop_RSqrt32Fx4: op = Asse_RSQRTF; goto do_32Fx4_unary;
+ case Iop_Sqrt32Fx4: op = Asse_SQRTF; goto do_32Fx4_unary;
+ do_32Fx4_unary:
+ {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_Sse32Fx4(op, arg, dst));
+ return dst;
+ }
+
+//.. case Iop_Recip64Fx2: op = Xsse_RCPF; goto do_64Fx2_unary;
+//.. case Iop_RSqrt64Fx2: op = Asse_RSQRTF; goto do_64Fx2_unary;
+ case Iop_Sqrt64Fx2: op = Asse_SQRTF; goto do_64Fx2_unary;
+ do_64Fx2_unary:
+ {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_Sse64Fx2(op, arg, dst));
+ return dst;
+ }
+
+ case Iop_Recip32F0x4: op = Asse_RCPF; goto do_32F0x4_unary;
+ case Iop_RSqrt32F0x4: op = Asse_RSQRTF; goto do_32F0x4_unary;
+ case Iop_Sqrt32F0x4: op = Asse_SQRTF; goto do_32F0x4_unary;
+ do_32F0x4_unary:
+ {
+ /* A bit subtle. We have to copy the arg to the result
+ register first, because actually doing the SSE scalar insn
+ leaves the upper 3/4 of the destination register
+ unchanged. Whereas the required semantics of these
+ primops is that the upper 3/4 is simply copied in from the
+ argument. */
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(arg, dst));
+ addInstr(env, AMD64Instr_Sse32FLo(op, arg, dst));
+ return dst;
+ }
+
+//.. case Iop_Recip64F0x2: op = Xsse_RCPF; goto do_64F0x2_unary;
+//.. case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary;
+ case Iop_Sqrt64F0x2: op = Asse_SQRTF; goto do_64F0x2_unary;
+ do_64F0x2_unary:
+ {
+ /* A bit subtle. We have to copy the arg to the result
+ register first, because actually doing the SSE scalar insn
+ leaves the upper half of the destination register
+ unchanged. Whereas the required semantics of these
+ primops is that the upper half is simply copied in from the
+ argument. */
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(arg, dst));
+ addInstr(env, AMD64Instr_Sse64FLo(op, arg, dst));
+ return dst;
+ }
+
+ case Iop_32UtoV128: {
+ HReg dst = newVRegV(env);
+ AMD64AMode* rsp_m32 = AMD64AMode_IR(-32, hregAMD64_RSP());
+ AMD64RI* ri = iselIntExpr_RI(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, ri, rsp_m32));
+ addInstr(env, AMD64Instr_SseLdzLO(4, dst, rsp_m32));
+ return dst;
+ }
+
+ case Iop_64UtoV128: {
+ HReg dst = newVRegV(env);
+ AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
+ AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
+ addInstr(env, AMD64Instr_Push(rmi));
+ addInstr(env, AMD64Instr_SseLdzLO(8, dst, rsp0));
+ add_to_rsp(env, 8);
+ return dst;
+ }
+
+ default:
+ break;
+ } /* switch (e->Iex.Unop.op) */
+ } /* if (e->tag == Iex_Unop) */
+
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+
+ case Iop_SetV128lo64: {
+ HReg dst = newVRegV(env);
+ HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
+ addInstr(env, AMD64Instr_Alu64M(Aalu_MOV, AMD64RI_Reg(srcI), rsp_m16));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
+ return dst;
+ }
+
+ case Iop_SetV128lo32: {
+ HReg dst = newVRegV(env);
+ HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ AMD64AMode* rsp_m16 = AMD64AMode_IR(-16, hregAMD64_RSP());
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, srcV, rsp_m16));
+ addInstr(env, AMD64Instr_Store(4, srcI, rsp_m16));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp_m16));
+ return dst;
+ }
+
+ case Iop_64HLtoV128: {
+ AMD64AMode* rsp = AMD64AMode_IR(0, hregAMD64_RSP());
+ HReg dst = newVRegV(env);
+ /* do this via the stack (easy, convenient, etc) */
+ addInstr(env, AMD64Instr_Push(iselIntExpr_RMI(env, e->Iex.Binop.arg1)));
+ addInstr(env, AMD64Instr_Push(iselIntExpr_RMI(env, e->Iex.Binop.arg2)));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, dst, rsp));
+ add_to_rsp(env, 16);
+ return dst;
+ }
+
+ case Iop_CmpEQ32Fx4: op = Asse_CMPEQF; goto do_32Fx4;
+ case Iop_CmpLT32Fx4: op = Asse_CMPLTF; goto do_32Fx4;
+ case Iop_CmpLE32Fx4: op = Asse_CMPLEF; goto do_32Fx4;
+ case Iop_CmpUN32Fx4: op = Asse_CMPUNF; goto do_32Fx4;
+ case Iop_Add32Fx4: op = Asse_ADDF; goto do_32Fx4;
+ case Iop_Div32Fx4: op = Asse_DIVF; goto do_32Fx4;
+ case Iop_Max32Fx4: op = Asse_MAXF; goto do_32Fx4;
+ case Iop_Min32Fx4: op = Asse_MINF; goto do_32Fx4;
+ case Iop_Mul32Fx4: op = Asse_MULF; goto do_32Fx4;
+ case Iop_Sub32Fx4: op = Asse_SUBF; goto do_32Fx4;
+ do_32Fx4:
+ {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(argL, dst));
+ addInstr(env, AMD64Instr_Sse32Fx4(op, argR, dst));
+ return dst;
+ }
+
+ case Iop_CmpEQ64Fx2: op = Asse_CMPEQF; goto do_64Fx2;
+ case Iop_CmpLT64Fx2: op = Asse_CMPLTF; goto do_64Fx2;
+ case Iop_CmpLE64Fx2: op = Asse_CMPLEF; goto do_64Fx2;
+ case Iop_CmpUN64Fx2: op = Asse_CMPUNF; goto do_64Fx2;
+ case Iop_Add64Fx2: op = Asse_ADDF; goto do_64Fx2;
+ case Iop_Div64Fx2: op = Asse_DIVF; goto do_64Fx2;
+ case Iop_Max64Fx2: op = Asse_MAXF; goto do_64Fx2;
+ case Iop_Min64Fx2: op = Asse_MINF; goto do_64Fx2;
+ case Iop_Mul64Fx2: op = Asse_MULF; goto do_64Fx2;
+ case Iop_Sub64Fx2: op = Asse_SUBF; goto do_64Fx2;
+ do_64Fx2:
+ {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(argL, dst));
+ addInstr(env, AMD64Instr_Sse64Fx2(op, argR, dst));
+ return dst;
+ }
+
+ case Iop_CmpEQ32F0x4: op = Asse_CMPEQF; goto do_32F0x4;
+ case Iop_CmpLT32F0x4: op = Asse_CMPLTF; goto do_32F0x4;
+ case Iop_CmpLE32F0x4: op = Asse_CMPLEF; goto do_32F0x4;
+ case Iop_CmpUN32F0x4: op = Asse_CMPUNF; goto do_32F0x4;
+ case Iop_Add32F0x4: op = Asse_ADDF; goto do_32F0x4;
+ case Iop_Div32F0x4: op = Asse_DIVF; goto do_32F0x4;
+ case Iop_Max32F0x4: op = Asse_MAXF; goto do_32F0x4;
+ case Iop_Min32F0x4: op = Asse_MINF; goto do_32F0x4;
+ case Iop_Mul32F0x4: op = Asse_MULF; goto do_32F0x4;
+ case Iop_Sub32F0x4: op = Asse_SUBF; goto do_32F0x4;
+ do_32F0x4: {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(argL, dst));
+ addInstr(env, AMD64Instr_Sse32FLo(op, argR, dst));
+ return dst;
+ }
+
+ case Iop_CmpEQ64F0x2: op = Asse_CMPEQF; goto do_64F0x2;
+ case Iop_CmpLT64F0x2: op = Asse_CMPLTF; goto do_64F0x2;
+ case Iop_CmpLE64F0x2: op = Asse_CMPLEF; goto do_64F0x2;
+ case Iop_CmpUN64F0x2: op = Asse_CMPUNF; goto do_64F0x2;
+ case Iop_Add64F0x2: op = Asse_ADDF; goto do_64F0x2;
+ case Iop_Div64F0x2: op = Asse_DIVF; goto do_64F0x2;
+ case Iop_Max64F0x2: op = Asse_MAXF; goto do_64F0x2;
+ case Iop_Min64F0x2: op = Asse_MINF; goto do_64F0x2;
+ case Iop_Mul64F0x2: op = Asse_MULF; goto do_64F0x2;
+ case Iop_Sub64F0x2: op = Asse_SUBF; goto do_64F0x2;
+ do_64F0x2: {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(argL, dst));
+ addInstr(env, AMD64Instr_Sse64FLo(op, argR, dst));
+ return dst;
+ }
+
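+         /* The pack/unpack cases below are non-commutative: setting
+            arg1isEReg routes IR arg1 into the SSE E (source) operand
+            position and arg2 into the destination, matching the
+            hardware operand order. */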
+ case Iop_QNarrow32Sx4:
+ op = Asse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
+ case Iop_QNarrow16Sx8:
+ op = Asse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
+ case Iop_QNarrow16Ux8:
+ op = Asse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
+
+ case Iop_InterleaveHI8x16:
+ op = Asse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveHI16x8:
+ op = Asse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveHI32x4:
+ op = Asse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveHI64x2:
+ op = Asse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
+
+ case Iop_InterleaveLO8x16:
+ op = Asse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveLO16x8:
+ op = Asse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveLO32x4:
+ op = Asse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveLO64x2:
+ op = Asse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
+
+ case Iop_AndV128: op = Asse_AND; goto do_SseReRg;
+ case Iop_OrV128: op = Asse_OR; goto do_SseReRg;
+ case Iop_XorV128: op = Asse_XOR; goto do_SseReRg;
+ case Iop_Add8x16: op = Asse_ADD8; goto do_SseReRg;
+ case Iop_Add16x8: op = Asse_ADD16; goto do_SseReRg;
+ case Iop_Add32x4: op = Asse_ADD32; goto do_SseReRg;
+ case Iop_Add64x2: op = Asse_ADD64; goto do_SseReRg;
+ case Iop_QAdd8Sx16: op = Asse_QADD8S; goto do_SseReRg;
+ case Iop_QAdd16Sx8: op = Asse_QADD16S; goto do_SseReRg;
+ case Iop_QAdd8Ux16: op = Asse_QADD8U; goto do_SseReRg;
+ case Iop_QAdd16Ux8: op = Asse_QADD16U; goto do_SseReRg;
+ case Iop_Avg8Ux16: op = Asse_AVG8U; goto do_SseReRg;
+ case Iop_Avg16Ux8: op = Asse_AVG16U; goto do_SseReRg;
+ case Iop_CmpEQ8x16: op = Asse_CMPEQ8; goto do_SseReRg;
+ case Iop_CmpEQ16x8: op = Asse_CMPEQ16; goto do_SseReRg;
+ case Iop_CmpEQ32x4: op = Asse_CMPEQ32; goto do_SseReRg;
+ case Iop_CmpGT8Sx16: op = Asse_CMPGT8S; goto do_SseReRg;
+ case Iop_CmpGT16Sx8: op = Asse_CMPGT16S; goto do_SseReRg;
+ case Iop_CmpGT32Sx4: op = Asse_CMPGT32S; goto do_SseReRg;
+ case Iop_Max16Sx8: op = Asse_MAX16S; goto do_SseReRg;
+ case Iop_Max8Ux16: op = Asse_MAX8U; goto do_SseReRg;
+ case Iop_Min16Sx8: op = Asse_MIN16S; goto do_SseReRg;
+ case Iop_Min8Ux16: op = Asse_MIN8U; goto do_SseReRg;
+ case Iop_MulHi16Ux8: op = Asse_MULHI16U; goto do_SseReRg;
+ case Iop_MulHi16Sx8: op = Asse_MULHI16S; goto do_SseReRg;
+ case Iop_Mul16x8: op = Asse_MUL16; goto do_SseReRg;
+ case Iop_Sub8x16: op = Asse_SUB8; goto do_SseReRg;
+ case Iop_Sub16x8: op = Asse_SUB16; goto do_SseReRg;
+ case Iop_Sub32x4: op = Asse_SUB32; goto do_SseReRg;
+ case Iop_Sub64x2: op = Asse_SUB64; goto do_SseReRg;
+ case Iop_QSub8Sx16: op = Asse_QSUB8S; goto do_SseReRg;
+ case Iop_QSub16Sx8: op = Asse_QSUB16S; goto do_SseReRg;
+ case Iop_QSub8Ux16: op = Asse_QSUB8U; goto do_SseReRg;
+ case Iop_QSub16Ux8: op = Asse_QSUB16U; goto do_SseReRg;
+ do_SseReRg: {
+ HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ if (arg1isEReg) {
+ addInstr(env, mk_vMOVsd_RR(arg2, dst));
+ addInstr(env, AMD64Instr_SseReRg(op, arg1, dst));
+ } else {
+ addInstr(env, mk_vMOVsd_RR(arg1, dst));
+ addInstr(env, AMD64Instr_SseReRg(op, arg2, dst));
+ }
+ return dst;
+ }
+
+ case Iop_ShlN16x8: op = Asse_SHL16; goto do_SseShift;
+ case Iop_ShlN32x4: op = Asse_SHL32; goto do_SseShift;
+ case Iop_ShlN64x2: op = Asse_SHL64; goto do_SseShift;
+ case Iop_SarN16x8: op = Asse_SAR16; goto do_SseShift;
+ case Iop_SarN32x4: op = Asse_SAR32; goto do_SseShift;
+ case Iop_ShrN16x8: op = Asse_SHR16; goto do_SseShift;
+ case Iop_ShrN32x4: op = Asse_SHR32; goto do_SseShift;
+ case Iop_ShrN64x2: op = Asse_SHR64; goto do_SseShift;
+ do_SseShift: {
+ HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+ AMD64RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ AMD64AMode* rsp0 = AMD64AMode_IR(0, hregAMD64_RSP());
+ HReg ereg = newVRegV(env);
+ HReg dst = newVRegV(env);
+ addInstr(env, AMD64Instr_Push(AMD64RMI_Imm(0)));
+ addInstr(env, AMD64Instr_Push(rmi));
+ addInstr(env, AMD64Instr_SseLdSt(True/*load*/, 16, ereg, rsp0));
+ addInstr(env, mk_vMOVsd_RR(greg, dst));
+ addInstr(env, AMD64Instr_SseReRg(op, ereg, dst));
+ add_to_rsp(env, 16);
+ return dst;
+ }
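+         /* Illustration only: the reg-reg SSE2 shift form used above
+            takes its count from the low 64 bits of the E register,
+            which is why the count is pushed beneath a zero qword and
+            reloaded into an xmm.  A sketch via intrinsics: */
+# if 0
+         #include <emmintrin.h>
+         static __m128i model_ShlN16x8 ( __m128i v, unsigned amt )
+         {
+            __m128i count = _mm_cvtsi64_si128((long long)amt);
+            return _mm_sll_epi16(v, count);   /* psllw xmm, xmm */
+         }
+# endif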
+
+ case Iop_Mul32x4: fn = (HWord)h_generic_calc_Mul32x4;
+ goto do_SseAssistedBinary;
+ case Iop_Max32Sx4: fn = (HWord)h_generic_calc_Max32Sx4;
+ goto do_SseAssistedBinary;
+ case Iop_Min32Sx4: fn = (HWord)h_generic_calc_Min32Sx4;
+ goto do_SseAssistedBinary;
+ case Iop_Max32Ux4: fn = (HWord)h_generic_calc_Max32Ux4;
+ goto do_SseAssistedBinary;
+ case Iop_Min32Ux4: fn = (HWord)h_generic_calc_Min32Ux4;
+ goto do_SseAssistedBinary;
+ case Iop_Max16Ux8: fn = (HWord)h_generic_calc_Max16Ux8;
+ goto do_SseAssistedBinary;
+ case Iop_Min16Ux8: fn = (HWord)h_generic_calc_Min16Ux8;
+ goto do_SseAssistedBinary;
+ case Iop_Max8Sx16: fn = (HWord)h_generic_calc_Max8Sx16;
+ goto do_SseAssistedBinary;
+ case Iop_Min8Sx16: fn = (HWord)h_generic_calc_Min8Sx16;
+ goto do_SseAssistedBinary;
+ case Iop_CmpGT64Sx2: fn = (HWord)h_generic_calc_CmpGT64Sx2;
+ goto do_SseAssistedBinary;
+ do_SseAssistedBinary: {
+ /* RRRufff! RRRufff code is what we're generating here. Oh
+ well. */
+ vassert(fn != 0);
+ HReg dst = newVRegV(env);
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg argp = newVRegI(env);
+         /* subq $112, %rsp -- make a space */
+ sub_from_rsp(env, 112);
+ /* leaq 48(%rsp), %r_argp -- point into it */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(48, hregAMD64_RSP()),
+ argp));
+ /* andq $-16, %r_argp -- 16-align the pointer */
+ addInstr(env, AMD64Instr_Alu64R(Aalu_AND,
+ AMD64RMI_Imm( ~(UInt)15 ),
+ argp));
+ /* Prepare 3 arg regs:
+ leaq 0(%r_argp), %rdi
+ leaq 16(%r_argp), %rsi
+ leaq 32(%r_argp), %rdx
+ */
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(0, argp),
+ hregAMD64_RDI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(16, argp),
+ hregAMD64_RSI()));
+ addInstr(env, AMD64Instr_Lea64(AMD64AMode_IR(32, argp),
+ hregAMD64_RDX()));
+ /* Store the two args, at (%rsi) and (%rdx):
+ movupd %argL, 0(%rsi)
+ movupd %argR, 0(%rdx)
+ */
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argL,
+ AMD64AMode_IR(0, hregAMD64_RSI())));
+ addInstr(env, AMD64Instr_SseLdSt(False/*!isLoad*/, 16, argR,
+ AMD64AMode_IR(0, hregAMD64_RDX())));
+ /* call the helper */
+ addInstr(env, AMD64Instr_Call( Acc_ALWAYS, (ULong)fn, 3 ));
+ /* fetch the result from memory, using %r_argp, which the
+ register allocator will keep alive across the call. */
+ addInstr(env, AMD64Instr_SseLdSt(True/*isLoad*/, 16, dst,
+ AMD64AMode_IR(0, argp)));
+ /* and finally, clear the space */
+ add_to_rsp(env, 112);
+ return dst;
+ }
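+         /* Illustration only: assumed shape of the h_generic_calc_*
+            helpers called above.  The three LEAs pass &res, &argL,
+            &argR in %rdi/%rsi/%rdx per the amd64 ELF ABI, and the
+            helper works lane by lane in C.  A sketch; the V128_model
+            type is local to this illustration: */
+# if 0
+         #include <stdint.h>
+         typedef struct { uint32_t w32[4]; } V128_model;
+         static void model_calc_Mul32x4 ( V128_model* res,
+                                          V128_model* argL,
+                                          V128_model* argR )
+         {
+            int i;
+            for (i = 0; i < 4; i++)
+               res->w32[i] = argL->w32[i] * argR->w32[i];
+         }
+# endif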
+
+ default:
+ break;
+ } /* switch (e->Iex.Binop.op) */
+ } /* if (e->tag == Iex_Binop) */
+
+ if (e->tag == Iex_Mux0X) {
+ HReg r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
+ HReg rX = iselVecExpr(env, e->Iex.Mux0X.exprX);
+ HReg r0 = iselVecExpr(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(rX,dst));
+ addInstr(env, AMD64Instr_Test64(0xFF, r8));
+ addInstr(env, AMD64Instr_SseCMov(Acc_Z,r0,dst));
+ return dst;
+ }
+
+ //vec_fail:
+ vex_printf("iselVecExpr (amd64, subarch = %s): can't reduce\n",
+ LibVEX_ppVexHwCaps(VexArchAMD64, env->hwcaps));
+ ppIRExpr(e);
+ vpanic("iselVecExpr_wrk");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Statements ---*/
+/*---------------------------------------------------------*/
+
+static void iselStmt ( ISelEnv* env, IRStmt* stmt )
+{
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ vex_printf("\n-- ");
+ ppIRStmt(stmt);
+ vex_printf("\n");
+ }
+
+ switch (stmt->tag) {
+
+ /* --------- STORE --------- */
+ case Ist_Store: {
+ IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
+ IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
+ IREndness end = stmt->Ist.Store.end;
+
+ if (tya != Ity_I64 || end != Iend_LE)
+ goto stmt_fail;
+
+ if (tyd == Ity_I64) {
+ AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
+ addInstr(env, AMD64Instr_Alu64M(Aalu_MOV,ri,am));
+ return;
+ }
+ if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32) {
+ AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
+ addInstr(env, AMD64Instr_Store(
+ toUChar(tyd==Ity_I8 ? 1 : (tyd==Ity_I16 ? 2 : 4)),
+ r,am));
+ return;
+ }
+ if (tyd == Ity_F64) {
+ AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ HReg r = iselDblExpr(env, stmt->Ist.Store.data);
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 8, r, am));
+ return;
+ }
+ if (tyd == Ity_F32) {
+ AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ HReg r = iselFltExpr(env, stmt->Ist.Store.data);
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 4, r, am));
+ return;
+ }
+ if (tyd == Ity_V128) {
+ AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ HReg r = iselVecExpr(env, stmt->Ist.Store.data);
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, r, am));
+ return;
+ }
+ break;
+ }
+
+ /* --------- PUT --------- */
+ case Ist_Put: {
+ IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
+ if (ty == Ity_I64) {
+ /* We're going to write to memory, so compute the RHS into an
+ AMD64RI. */
+ AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
+ addInstr(env,
+ AMD64Instr_Alu64M(
+ Aalu_MOV,
+ ri,
+ AMD64AMode_IR(stmt->Ist.Put.offset,
+ hregAMD64_RBP())
+ ));
+ return;
+ }
+ if (ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32) {
+ HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
+ addInstr(env, AMD64Instr_Store(
+ toUChar(ty==Ity_I8 ? 1 : (ty==Ity_I16 ? 2 : 4)),
+ r,
+ AMD64AMode_IR(stmt->Ist.Put.offset,
+ hregAMD64_RBP())));
+ return;
+ }
+ if (ty == Ity_V128) {
+ HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
+ AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset,
+ hregAMD64_RBP());
+ addInstr(env, AMD64Instr_SseLdSt(False/*store*/, 16, vec, am));
+ return;
+ }
+ if (ty == Ity_F32) {
+ HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
+ AMD64AMode* am = AMD64AMode_IR(stmt->Ist.Put.offset, hregAMD64_RBP());
+ set_SSE_rounding_default(env); /* paranoia */
+ addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 4, f32, am ));
+ return;
+ }
+ if (ty == Ity_F64) {
+ HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
+ AMD64AMode* am = AMD64AMode_IR( stmt->Ist.Put.offset,
+ hregAMD64_RBP() );
+ addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, f64, am ));
+ return;
+ }
+ break;
+ }
+
+ /* --------- Indexed PUT --------- */
+ case Ist_PutI: {
+ AMD64AMode* am
+ = genGuestArrayOffset(
+ env, stmt->Ist.PutI.descr,
+ stmt->Ist.PutI.ix, stmt->Ist.PutI.bias );
+
+ IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.PutI.data);
+ if (ty == Ity_F64) {
+ HReg val = iselDblExpr(env, stmt->Ist.PutI.data);
+ addInstr(env, AMD64Instr_SseLdSt( False/*store*/, 8, val, am ));
+ return;
+ }
+ if (ty == Ity_I8) {
+ HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
+ addInstr(env, AMD64Instr_Store( 1, r, am ));
+ return;
+ }
+ if (ty == Ity_I64) {
+ AMD64RI* ri = iselIntExpr_RI(env, stmt->Ist.PutI.data);
+ addInstr(env, AMD64Instr_Alu64M( Aalu_MOV, ri, am ));
+ return;
+ }
+ break;
+ }
+
+ /* --------- TMP --------- */
+ case Ist_WrTmp: {
+ IRTemp tmp = stmt->Ist.WrTmp.tmp;
+ IRType ty = typeOfIRTemp(env->type_env, tmp);
+
+ /* optimisation: if stmt->Ist.WrTmp.data is Add64(..,..),
+ compute it into an AMode and then use LEA. This usually
+ produces fewer instructions, often because (for memcheck
+ created IR) we get t = address-expression, (t is later used
+ twice) and so doing this naturally turns address-expression
+ back into an AMD64 amode. */
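+      /* For instance (illustrative): t = Add64(Add64(r1, Shl64(r2,3)), 16)
+         can come out as a single "leaq 16(r1,r2,8), t". */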
+ if (ty == Ity_I64
+ && stmt->Ist.WrTmp.data->tag == Iex_Binop
+ && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add64) {
+ AMD64AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ if (am->tag == Aam_IR && am->Aam.IR.imm == 0) {
+ /* Hmm, iselIntExpr_AMode wimped out and just computed the
+ value into a register. Just emit a normal reg-reg move
+ so reg-alloc can coalesce it away in the usual way. */
+ HReg src = am->Aam.IR.reg;
+ addInstr(env, AMD64Instr_Alu64R(Aalu_MOV, AMD64RMI_Reg(src), dst));
+ } else {
+ addInstr(env, AMD64Instr_Lea64(am,dst));
+ }
+ return;
+ }
+
+ if (ty == Ity_I64 || ty == Ity_I32
+ || ty == Ity_I16 || ty == Ity_I8) {
+ AMD64RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ addInstr(env, AMD64Instr_Alu64R(Aalu_MOV,rmi,dst));
+ return;
+ }
+ if (ty == Ity_I128) {
+ HReg rHi, rLo, dstHi, dstLo;
+ iselInt128Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
+ lookupIRTemp128( &dstHi, &dstLo, env, tmp);
+ addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
+ addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
+ return;
+ }
+ if (ty == Ity_I1) {
+ AMD64CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ addInstr(env, AMD64Instr_Set64(cond, dst));
+ return;
+ }
+ if (ty == Ity_F64) {
+ HReg dst = lookupIRTemp(env, tmp);
+ HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
+ addInstr(env, mk_vMOVsd_RR(src, dst));
+ return;
+ }
+ if (ty == Ity_F32) {
+ HReg dst = lookupIRTemp(env, tmp);
+ HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
+ addInstr(env, mk_vMOVsd_RR(src, dst));
+ return;
+ }
+ if (ty == Ity_V128) {
+ HReg dst = lookupIRTemp(env, tmp);
+ HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
+ addInstr(env, mk_vMOVsd_RR(src, dst));
+ return;
+ }
+ break;
+ }
+
+ /* --------- Call to DIRTY helper --------- */
+ case Ist_Dirty: {
+ IRType retty;
+ IRDirty* d = stmt->Ist.Dirty.details;
+ Bool passBBP = False;
+
+ if (d->nFxState == 0)
+ vassert(!d->needsBBP);
+
+ passBBP = toBool(d->nFxState > 0 && d->needsBBP);
+
+ /* Marshal args, do the call, clear stack. */
+ doHelperCall( env, passBBP, d->guard, d->cee, d->args );
+
+ /* Now figure out what to do with the returned value, if any. */
+ if (d->tmp == IRTemp_INVALID)
+ /* No return value. Nothing to do. */
+ return;
+
+ retty = typeOfIRTemp(env->type_env, d->tmp);
+ if (retty == Ity_I64 || retty == Ity_I32
+ || retty == Ity_I16 || retty == Ity_I8) {
+ /* The returned value is in %rax. Park it in the register
+ associated with tmp. */
+ HReg dst = lookupIRTemp(env, d->tmp);
+ addInstr(env, mk_iMOVsd_RR(hregAMD64_RAX(),dst) );
+ return;
+ }
+ break;
+ }
+
+ /* --------- MEM FENCE --------- */
+ case Ist_MBE:
+ switch (stmt->Ist.MBE.event) {
+ case Imbe_Fence:
+ addInstr(env, AMD64Instr_MFence());
+ return;
+ default:
+ break;
+ }
+ break;
+
+ /* --------- ACAS --------- */
+ case Ist_CAS:
+ if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
+ /* "normal" singleton CAS */
+ UChar sz;
+ IRCAS* cas = stmt->Ist.CAS.details;
+ IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
+         /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
+ AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
+ HReg rData = iselIntExpr_R(env, cas->dataLo);
+ HReg rExpd = iselIntExpr_R(env, cas->expdLo);
+ HReg rOld = lookupIRTemp(env, cas->oldLo);
+ vassert(cas->expdHi == NULL);
+ vassert(cas->dataHi == NULL);
+ addInstr(env, mk_iMOVsd_RR(rExpd, rOld));
+ addInstr(env, mk_iMOVsd_RR(rExpd, hregAMD64_RAX()));
+ addInstr(env, mk_iMOVsd_RR(rData, hregAMD64_RBX()));
+ switch (ty) {
+ case Ity_I64: sz = 8; break;
+ case Ity_I32: sz = 4; break;
+ case Ity_I16: sz = 2; break;
+ case Ity_I8: sz = 1; break;
+ default: goto unhandled_cas;
+ }
+ addInstr(env, AMD64Instr_ACAS(am, sz));
+ addInstr(env, AMD64Instr_CMov64(
+ Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOld));
+ return;
+ } else {
+ /* double CAS */
+ UChar sz;
+ IRCAS* cas = stmt->Ist.CAS.details;
+ IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
+ /* only 32-bit and 64-bit allowed in this case */
+ /* get: cas->expdLo into %rax, and cas->dataLo into %rbx */
+ /* get: cas->expdHi into %rdx, and cas->dataHi into %rcx */
+ AMD64AMode* am = iselIntExpr_AMode(env, cas->addr);
+ HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
+ HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
+ HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
+ HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
+ HReg rOldHi = lookupIRTemp(env, cas->oldHi);
+ HReg rOldLo = lookupIRTemp(env, cas->oldLo);
+ switch (ty) {
+ case Ity_I64:
+ if (!(env->hwcaps & VEX_HWCAPS_AMD64_CX16))
+ goto unhandled_cas; /* we'd have to generate
+ cmpxchg16b, but the host
+ doesn't support that */
+ sz = 8;
+ break;
+ case Ity_I32:
+ sz = 4;
+ break;
+ default:
+ goto unhandled_cas;
+ }
+ addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
+ addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
+ addInstr(env, mk_iMOVsd_RR(rExpdHi, hregAMD64_RDX()));
+ addInstr(env, mk_iMOVsd_RR(rExpdLo, hregAMD64_RAX()));
+ addInstr(env, mk_iMOVsd_RR(rDataHi, hregAMD64_RCX()));
+ addInstr(env, mk_iMOVsd_RR(rDataLo, hregAMD64_RBX()));
+ addInstr(env, AMD64Instr_DACAS(am, sz));
+ addInstr(env,
+ AMD64Instr_CMov64(
+ Acc_NZ, AMD64RM_Reg(hregAMD64_RDX()), rOldHi));
+ addInstr(env,
+ AMD64Instr_CMov64(
+ Acc_NZ, AMD64RM_Reg(hregAMD64_RAX()), rOldLo));
+ return;
+ }
+ unhandled_cas:
+ break;
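+      /* Illustration only: the net effect of the singleton sequence
+         above, expressed with a GCC builtin.  oldLo always receives
+         the value that was in memory -- equal to expdLo on success,
+         hence the initial rExpd->rOld copy plus the conditional move
+         from %rax on failure.  A sketch: */
+# if 0
+      #include <stdint.h>
+      static uint64_t model_CAS64 ( uint64_t* addr,
+                                    uint64_t expd, uint64_t data )
+      {
+         return __sync_val_compare_and_swap(addr, expd, data);
+      }
+# endif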
+
+ /* --------- INSTR MARK --------- */
+ /* Doesn't generate any executable code ... */
+ case Ist_IMark:
+ return;
+
+ /* --------- ABI HINT --------- */
+ /* These have no meaning (denotation in the IR) and so we ignore
+ them ... if any actually made it this far. */
+ case Ist_AbiHint:
+ return;
+
+ /* --------- NO-OP --------- */
+ case Ist_NoOp:
+ return;
+
+ /* --------- EXIT --------- */
+ case Ist_Exit: {
+ AMD64RI* dst;
+ AMD64CondCode cc;
+ if (stmt->Ist.Exit.dst->tag != Ico_U64)
+ vpanic("iselStmt(amd64): Ist_Exit: dst is not a 64-bit value");
+ dst = iselIntExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
+ cc = iselCondCode(env,stmt->Ist.Exit.guard);
+ addInstr(env, AMD64Instr_Goto(stmt->Ist.Exit.jk, cc, dst));
+ return;
+ }
+
+ default: break;
+ }
+ stmt_fail:
+ ppIRStmt(stmt);
+ vpanic("iselStmt(amd64)");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Basic block terminators (Nexts) ---*/
+/*---------------------------------------------------------*/
+
+static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
+{
+ AMD64RI* ri;
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ vex_printf("\n-- goto {");
+ ppIRJumpKind(jk);
+ vex_printf("} ");
+ ppIRExpr(next);
+ vex_printf("\n");
+ }
+ ri = iselIntExpr_RI(env, next);
+ addInstr(env, AMD64Instr_Goto(jk, Acc_ALWAYS,ri));
+}
+
+
+/*---------------------------------------------------------*/
+/*--- Insn selector top-level ---*/
+/*---------------------------------------------------------*/
+
+/* Translate an entire SB to amd64 code. */
+
+HInstrArray* iselSB_AMD64 ( IRSB* bb, VexArch arch_host,
+ VexArchInfo* archinfo_host,
+ VexAbiInfo* vbi/*UNUSED*/ )
+{
+ Int i, j;
+ HReg hreg, hregHI;
+ ISelEnv* env;
+ UInt hwcaps_host = archinfo_host->hwcaps;
+
+ /* sanity ... */
+ vassert(arch_host == VexArchAMD64);
+ vassert(0 == (hwcaps_host
+ & ~(VEX_HWCAPS_AMD64_SSE3
+ | VEX_HWCAPS_AMD64_CX16
+ | VEX_HWCAPS_AMD64_LZCNT)));
+
+ /* Make up an initial environment to use. */
+ env = LibVEX_Alloc(sizeof(ISelEnv));
+ env->vreg_ctr = 0;
+
+ /* Set up output code array. */
+ env->code = newHInstrArray();
+
+ /* Copy BB's type env. */
+ env->type_env = bb->tyenv;
+
+ /* Make up an IRTemp -> virtual HReg mapping. This doesn't
+ change as we go along. */
+ env->n_vregmap = bb->tyenv->types_used;
+ env->vregmap = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
+ env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
+
+ /* and finally ... */
+ env->hwcaps = hwcaps_host;
+
+ /* For each IR temporary, allocate a suitably-kinded virtual
+ register. */
+ j = 0;
+ for (i = 0; i < env->n_vregmap; i++) {
+ hregHI = hreg = INVALID_HREG;
+ switch (bb->tyenv->types[i]) {
+ case Ity_I1:
+ case Ity_I8:
+ case Ity_I16:
+ case Ity_I32:
+ case Ity_I64: hreg = mkHReg(j++, HRcInt64, True); break;
+ case Ity_I128: hreg = mkHReg(j++, HRcInt64, True);
+ hregHI = mkHReg(j++, HRcInt64, True); break;
+ case Ity_F32:
+ case Ity_F64:
+ case Ity_V128: hreg = mkHReg(j++, HRcVec128, True); break;
+ default: ppIRType(bb->tyenv->types[i]);
+ vpanic("iselBB(amd64): IRTemp type");
+ }
+ env->vregmap[i] = hreg;
+ env->vregmapHI[i] = hregHI;
+ }
+ env->vreg_ctr = j;
+
+ /* Ok, finally we can iterate over the statements. */
+ for (i = 0; i < bb->stmts_used; i++)
+ if (bb->stmts[i])
+ iselStmt(env,bb->stmts[i]);
+
+ iselNext(env,bb->next,bb->jumpkind);
+
+ /* record the number of vregs we used. */
+ env->code->n_vregs = env->vreg_ctr;
+ return env->code;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_amd64_isel.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_arm_defs.c b/VEX/priv/host_arm_defs.c
new file mode 100644
index 0000000..122a9f9
--- /dev/null
+++ b/VEX/priv/host_arm_defs.c
@@ -0,0 +1,4097 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_arm_defs.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ NEON support is
+ Copyright (C) 2010-2010 Samsung Electronics
+ contributed by Dmitry Zhurikhin <zhur@ispras.ru>
+ and Kirill Batuzov <batuzovk@ispras.ru>
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex.h"
+#include "libvex_trc_values.h"
+
+#include "main_util.h"
+#include "host_generic_regs.h"
+#include "host_arm_defs.h"
+
+UInt arm_hwcaps = 0;
+
+
+/* --------- Registers. --------- */
+
+/* The usual HReg abstraction.
+ There are 16 general purpose regs.
+*/
+
+void ppHRegARM ( HReg reg ) {
+ Int r;
+ /* Be generic for all virtual regs. */
+ if (hregIsVirtual(reg)) {
+ ppHReg(reg);
+ return;
+ }
+ /* But specific for real regs. */
+ switch (hregClass(reg)) {
+ case HRcInt32:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 16);
+ vex_printf("r%d", r);
+ return;
+ case HRcFlt64:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 32);
+ vex_printf("d%d", r);
+ return;
+ case HRcFlt32:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 32);
+ vex_printf("s%d", r);
+ return;
+ case HRcVec128:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 16);
+ vex_printf("q%d", r);
+ return;
+ default:
+ vpanic("ppHRegARM");
+ }
+}
+
+HReg hregARM_R0 ( void ) { return mkHReg(0, HRcInt32, False); }
+HReg hregARM_R1 ( void ) { return mkHReg(1, HRcInt32, False); }
+HReg hregARM_R2 ( void ) { return mkHReg(2, HRcInt32, False); }
+HReg hregARM_R3 ( void ) { return mkHReg(3, HRcInt32, False); }
+HReg hregARM_R4 ( void ) { return mkHReg(4, HRcInt32, False); }
+HReg hregARM_R5 ( void ) { return mkHReg(5, HRcInt32, False); }
+HReg hregARM_R6 ( void ) { return mkHReg(6, HRcInt32, False); }
+HReg hregARM_R7 ( void ) { return mkHReg(7, HRcInt32, False); }
+HReg hregARM_R8 ( void ) { return mkHReg(8, HRcInt32, False); }
+HReg hregARM_R9 ( void ) { return mkHReg(9, HRcInt32, False); }
+HReg hregARM_R10 ( void ) { return mkHReg(10, HRcInt32, False); }
+HReg hregARM_R11 ( void ) { return mkHReg(11, HRcInt32, False); }
+HReg hregARM_R12 ( void ) { return mkHReg(12, HRcInt32, False); }
+HReg hregARM_R13 ( void ) { return mkHReg(13, HRcInt32, False); }
+HReg hregARM_R14 ( void ) { return mkHReg(14, HRcInt32, False); }
+HReg hregARM_R15 ( void ) { return mkHReg(15, HRcInt32, False); }
+HReg hregARM_D8 ( void ) { return mkHReg(8, HRcFlt64, False); }
+HReg hregARM_D9 ( void ) { return mkHReg(9, HRcFlt64, False); }
+HReg hregARM_D10 ( void ) { return mkHReg(10, HRcFlt64, False); }
+HReg hregARM_D11 ( void ) { return mkHReg(11, HRcFlt64, False); }
+HReg hregARM_D12 ( void ) { return mkHReg(12, HRcFlt64, False); }
+HReg hregARM_S26 ( void ) { return mkHReg(26, HRcFlt32, False); }
+HReg hregARM_S27 ( void ) { return mkHReg(27, HRcFlt32, False); }
+HReg hregARM_S28 ( void ) { return mkHReg(28, HRcFlt32, False); }
+HReg hregARM_S29 ( void ) { return mkHReg(29, HRcFlt32, False); }
+HReg hregARM_S30 ( void ) { return mkHReg(30, HRcFlt32, False); }
+HReg hregARM_Q8 ( void ) { return mkHReg(8, HRcVec128, False); }
+HReg hregARM_Q9 ( void ) { return mkHReg(9, HRcVec128, False); }
+HReg hregARM_Q10 ( void ) { return mkHReg(10, HRcVec128, False); }
+HReg hregARM_Q11 ( void ) { return mkHReg(11, HRcVec128, False); }
+HReg hregARM_Q12 ( void ) { return mkHReg(12, HRcVec128, False); }
+HReg hregARM_Q13 ( void ) { return mkHReg(13, HRcVec128, False); }
+HReg hregARM_Q14 ( void ) { return mkHReg(14, HRcVec128, False); }
+HReg hregARM_Q15 ( void ) { return mkHReg(15, HRcVec128, False); }
+
+void getAllocableRegs_ARM ( Int* nregs, HReg** arr )
+{
+ Int i = 0;
+ *nregs = 26;
+ *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
+   // callee-saved ones are listed first, since we prefer them
+ // if they're available
+ (*arr)[i++] = hregARM_R4();
+ (*arr)[i++] = hregARM_R5();
+ (*arr)[i++] = hregARM_R6();
+ (*arr)[i++] = hregARM_R7();
+ (*arr)[i++] = hregARM_R10();
+ (*arr)[i++] = hregARM_R11();
+   // otherwise we'll have to slum it out with caller-saved ones
+ (*arr)[i++] = hregARM_R0();
+ (*arr)[i++] = hregARM_R1();
+ (*arr)[i++] = hregARM_R2();
+ (*arr)[i++] = hregARM_R3();
+ (*arr)[i++] = hregARM_R9();
+   // FP registers.  Note: these are all callee-saved.  Yay!
+ // Hence we don't need to mention them as trashed in
+ // getHRegUsage for ARMInstr_Call.
+ (*arr)[i++] = hregARM_D8();
+ (*arr)[i++] = hregARM_D9();
+ (*arr)[i++] = hregARM_D10();
+ (*arr)[i++] = hregARM_D11();
+ (*arr)[i++] = hregARM_D12();
+ (*arr)[i++] = hregARM_S26();
+ (*arr)[i++] = hregARM_S27();
+ (*arr)[i++] = hregARM_S28();
+ (*arr)[i++] = hregARM_S29();
+ (*arr)[i++] = hregARM_S30();
+
+ (*arr)[i++] = hregARM_Q8();
+ (*arr)[i++] = hregARM_Q9();
+ (*arr)[i++] = hregARM_Q10();
+ (*arr)[i++] = hregARM_Q11();
+ (*arr)[i++] = hregARM_Q12();
+
+ //(*arr)[i++] = hregARM_Q13();
+ //(*arr)[i++] = hregARM_Q14();
+ //(*arr)[i++] = hregARM_Q15();
+
+ // unavail: r8 as GSP
+ // r12 is used as a spill/reload temporary
+ // r13 as SP
+ // r14 as LR
+ // r15 as PC
+ //
+ // All in all, we have 11 allocatable integer registers:
+ // 0 1 2 3 4 5 6 7 9 10 11, with r8 dedicated as GSP
+ // and r12 dedicated as a spill temporary.
+ // 13 14 and 15 are not under the allocator's control.
+ //
+ // Hence for the allocatable registers we have:
+ //
+ // callee-saved: 4 5 6 7 (8) 9 10 11
+ // caller-saved: 0 1 2 3
+   // Note 9 is ambiguous: the base EABI does not give it a definite
+   // callee-/caller-saved designation, but the Linux instantiation of
+   // the ABI specifies it as callee-saved.
+   //
+   // If the set of available registers changes, or if their
+   // callee-/caller-saved status changes, be sure to re-check/sync the
+   // definition of getHRegUsage for ARMInstr_Call too.
+ vassert(i == *nregs);
+}
+
+
+
+/* --------- Condition codes, ARM encoding. --------- */
+
+HChar* showARMCondCode ( ARMCondCode cond ) {
+ switch (cond) {
+ case ARMcc_EQ: return "eq";
+ case ARMcc_NE: return "ne";
+ case ARMcc_HS: return "hs";
+ case ARMcc_LO: return "lo";
+ case ARMcc_MI: return "mi";
+ case ARMcc_PL: return "pl";
+ case ARMcc_VS: return "vs";
+ case ARMcc_VC: return "vc";
+ case ARMcc_HI: return "hi";
+ case ARMcc_LS: return "ls";
+ case ARMcc_GE: return "ge";
+ case ARMcc_LT: return "lt";
+ case ARMcc_GT: return "gt";
+ case ARMcc_LE: return "le";
+ case ARMcc_AL: return "al"; // default
+ case ARMcc_NV: return "nv";
+ default: vpanic("showARMCondCode");
+ }
+}
+
+
+/* --------- Mem AModes: Addressing Mode 1 --------- */
+
+ARMAMode1* ARMAMode1_RI ( HReg reg, Int simm13 ) {
+ ARMAMode1* am = LibVEX_Alloc(sizeof(ARMAMode1));
+ am->tag = ARMam1_RI;
+ am->ARMam1.RI.reg = reg;
+ am->ARMam1.RI.simm13 = simm13;
+ vassert(-4095 <= simm13 && simm13 <= 4095);
+ return am;
+}
+ARMAMode1* ARMAMode1_RRS ( HReg base, HReg index, UInt shift ) {
+ ARMAMode1* am = LibVEX_Alloc(sizeof(ARMAMode1));
+ am->tag = ARMam1_RRS;
+ am->ARMam1.RRS.base = base;
+ am->ARMam1.RRS.index = index;
+ am->ARMam1.RRS.shift = shift;
+ vassert(0 <= shift && shift <= 3);
+ return am;
+}
+
+void ppARMAMode1 ( ARMAMode1* am ) {
+ switch (am->tag) {
+ case ARMam1_RI:
+ vex_printf("%d(", am->ARMam1.RI.simm13);
+ ppHRegARM(am->ARMam1.RI.reg);
+ vex_printf(")");
+ break;
+ case ARMam1_RRS:
+ vex_printf("(");
+ ppHRegARM(am->ARMam1.RRS.base);
+ vex_printf(",");
+ ppHRegARM(am->ARMam1.RRS.index);
+ vex_printf(",%u)", am->ARMam1.RRS.shift);
+ break;
+ default:
+ vassert(0);
+ }
+}
+
+static void addRegUsage_ARMAMode1 ( HRegUsage* u, ARMAMode1* am ) {
+ switch (am->tag) {
+ case ARMam1_RI:
+ addHRegUse(u, HRmRead, am->ARMam1.RI.reg);
+ return;
+ case ARMam1_RRS:
+ // addHRegUse(u, HRmRead, am->ARMam1.RRS.base);
+ // addHRegUse(u, HRmRead, am->ARMam1.RRS.index);
+ // return;
+ default:
+ vpanic("addRegUsage_ARMAmode1");
+ }
+}
+
+static void mapRegs_ARMAMode1 ( HRegRemap* m, ARMAMode1* am ) {
+ switch (am->tag) {
+ case ARMam1_RI:
+ am->ARMam1.RI.reg = lookupHRegRemap(m, am->ARMam1.RI.reg);
+ return;
+ case ARMam1_RRS:
+ //am->ARMam1.RR.base =lookupHRegRemap(m, am->ARMam1.RR.base);
+ //am->ARMam1.RR.index = lookupHRegRemap(m, am->ARMam1.RR.index);
+ //return;
+ default:
+ vpanic("mapRegs_ARMAmode1");
+ }
+}
+
+
+/* --------- Mem AModes: Addressing Mode 2 --------- */
+
+ARMAMode2* ARMAMode2_RI ( HReg reg, Int simm9 ) {
+ ARMAMode2* am = LibVEX_Alloc(sizeof(ARMAMode2));
+ am->tag = ARMam2_RI;
+ am->ARMam2.RI.reg = reg;
+ am->ARMam2.RI.simm9 = simm9;
+ vassert(-255 <= simm9 && simm9 <= 255);
+ return am;
+}
+ARMAMode2* ARMAMode2_RR ( HReg base, HReg index ) {
+ ARMAMode2* am = LibVEX_Alloc(sizeof(ARMAMode2));
+ am->tag = ARMam2_RR;
+ am->ARMam2.RR.base = base;
+ am->ARMam2.RR.index = index;
+ return am;
+}
+
+void ppARMAMode2 ( ARMAMode2* am ) {
+ switch (am->tag) {
+ case ARMam2_RI:
+ vex_printf("%d(", am->ARMam2.RI.simm9);
+ ppHRegARM(am->ARMam2.RI.reg);
+ vex_printf(")");
+ break;
+ case ARMam2_RR:
+ vex_printf("(");
+ ppHRegARM(am->ARMam2.RR.base);
+ vex_printf(",");
+ ppHRegARM(am->ARMam2.RR.index);
+ vex_printf(")");
+ break;
+ default:
+ vassert(0);
+ }
+}
+
+static void addRegUsage_ARMAMode2 ( HRegUsage* u, ARMAMode2* am ) {
+ switch (am->tag) {
+ case ARMam2_RI:
+ addHRegUse(u, HRmRead, am->ARMam2.RI.reg);
+ return;
+ case ARMam2_RR:
+ // addHRegUse(u, HRmRead, am->ARMam2.RR.base);
+ // addHRegUse(u, HRmRead, am->ARMam2.RR.index);
+ // return;
+ default:
+ vpanic("addRegUsage_ARMAmode2");
+ }
+}
+
+static void mapRegs_ARMAMode2 ( HRegRemap* m, ARMAMode2* am ) {
+ switch (am->tag) {
+ case ARMam2_RI:
+ am->ARMam2.RI.reg = lookupHRegRemap(m, am->ARMam2.RI.reg);
+ return;
+ case ARMam2_RR:
+ //am->ARMam2.RR.base =lookupHRegRemap(m, am->ARMam2.RR.base);
+ //am->ARMam2.RR.index = lookupHRegRemap(m, am->ARMam2.RR.index);
+ //return;
+ default:
+ vpanic("mapRegs_ARMAmode2");
+ }
+}
+
+
+/* --------- Mem AModes: Addressing Mode VFP --------- */
+
+ARMAModeV* mkARMAModeV ( HReg reg, Int simm11 ) {
+ ARMAModeV* am = LibVEX_Alloc(sizeof(ARMAModeV));
+ vassert(simm11 >= -1020 && simm11 <= 1020);
+ vassert(0 == (simm11 & 3));
+ am->reg = reg;
+ am->simm11 = simm11;
+ return am;
+}
+
+void ppARMAModeV ( ARMAModeV* am ) {
+ vex_printf("%d(", am->simm11);
+ ppHRegARM(am->reg);
+ vex_printf(")");
+}
+
+static void addRegUsage_ARMAModeV ( HRegUsage* u, ARMAModeV* am ) {
+ addHRegUse(u, HRmRead, am->reg);
+}
+
+static void mapRegs_ARMAModeV ( HRegRemap* m, ARMAModeV* am ) {
+ am->reg = lookupHRegRemap(m, am->reg);
+}
+
+
+/* --------- Mem AModes: Addressing Mode Neon ------- */
+
+ARMAModeN *mkARMAModeN_RR ( HReg rN, HReg rM ) {
+ ARMAModeN* am = LibVEX_Alloc(sizeof(ARMAModeN));
+ am->tag = ARMamN_RR;
+ am->ARMamN.RR.rN = rN;
+ am->ARMamN.RR.rM = rM;
+ return am;
+}
+
+ARMAModeN *mkARMAModeN_R ( HReg rN ) {
+ ARMAModeN* am = LibVEX_Alloc(sizeof(ARMAModeN));
+ am->tag = ARMamN_R;
+ am->ARMamN.R.rN = rN;
+ return am;
+}
+
+static void addRegUsage_ARMAModeN ( HRegUsage* u, ARMAModeN* am ) {
+ if (am->tag == ARMamN_R) {
+ addHRegUse(u, HRmRead, am->ARMamN.R.rN);
+ } else {
+ addHRegUse(u, HRmRead, am->ARMamN.RR.rN);
+ addHRegUse(u, HRmRead, am->ARMamN.RR.rM);
+ }
+}
+
+static void mapRegs_ARMAModeN ( HRegRemap* m, ARMAModeN* am ) {
+ if (am->tag == ARMamN_R) {
+ am->ARMamN.R.rN = lookupHRegRemap(m, am->ARMamN.R.rN);
+ } else {
+ am->ARMamN.RR.rN = lookupHRegRemap(m, am->ARMamN.RR.rN);
+ am->ARMamN.RR.rM = lookupHRegRemap(m, am->ARMamN.RR.rM);
+ }
+}
+
+void ppARMAModeN ( ARMAModeN* am ) {
+ vex_printf("[");
+ if (am->tag == ARMamN_R) {
+ ppHRegARM(am->ARMamN.R.rN);
+ } else {
+ ppHRegARM(am->ARMamN.RR.rN);
+ }
+ vex_printf("]");
+ if (am->tag == ARMamN_RR) {
+ vex_printf(", ");
+ ppHRegARM(am->ARMamN.RR.rM);
+ }
+}
+
+
+/* --------- Reg or imm-8x4 operands --------- */
+
+static UInt ROR32 ( UInt x, UInt sh ) {
+ vassert(sh >= 0 && sh < 32);
+ if (sh == 0)
+ return x;
+ else
+ return (x << (32-sh)) | (x >> sh);
+}
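+/* Illustration only: finding an (imm8, imm4) pair for a 32-bit
+   constant by trying the 16 even rotations; this inverts the
+   ROR32-based decoding used in ppARMRI84 below.  A sketch, not part
+   of the backend: */
+# if 0
+#include <stdint.h>
+static int model_encode_I84 ( uint32_t x, uint32_t* imm8, uint32_t* imm4 )
+{
+   uint32_t rot;
+   for (rot = 0; rot < 16; rot++) {
+      /* undo ROR32(imm8, 2*rot) by rotating x left by 2*rot */
+      uint32_t sh = 2 * rot;
+      uint32_t y  = sh == 0 ? x : ((x << sh) | (x >> (32 - sh)));
+      if (y <= 0xFF) { *imm8 = y; *imm4 = rot; return 1; }
+   }
+   return 0; /* not representable */
+}
+# endif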
+
+ARMRI84* ARMRI84_I84 ( UShort imm8, UShort imm4 ) {
+ ARMRI84* ri84 = LibVEX_Alloc(sizeof(ARMRI84));
+ ri84->tag = ARMri84_I84;
+ ri84->ARMri84.I84.imm8 = imm8;
+ ri84->ARMri84.I84.imm4 = imm4;
+ vassert(imm8 >= 0 && imm8 <= 255);
+ vassert(imm4 >= 0 && imm4 <= 15);
+ return ri84;
+}
+ARMRI84* ARMRI84_R ( HReg reg ) {
+ ARMRI84* ri84 = LibVEX_Alloc(sizeof(ARMRI84));
+ ri84->tag = ARMri84_R;
+ ri84->ARMri84.R.reg = reg;
+ return ri84;
+}
+
+void ppARMRI84 ( ARMRI84* ri84 ) {
+ switch (ri84->tag) {
+ case ARMri84_I84:
+ vex_printf("0x%x", ROR32(ri84->ARMri84.I84.imm8,
+ 2 * ri84->ARMri84.I84.imm4));
+ break;
+ case ARMri84_R:
+ ppHRegARM(ri84->ARMri84.R.reg);
+ break;
+ default:
+ vassert(0);
+ }
+}
+
+static void addRegUsage_ARMRI84 ( HRegUsage* u, ARMRI84* ri84 ) {
+ switch (ri84->tag) {
+ case ARMri84_I84:
+ return;
+ case ARMri84_R:
+ addHRegUse(u, HRmRead, ri84->ARMri84.R.reg);
+ return;
+ default:
+ vpanic("addRegUsage_ARMRI84");
+ }
+}
+
+static void mapRegs_ARMRI84 ( HRegRemap* m, ARMRI84* ri84 ) {
+ switch (ri84->tag) {
+ case ARMri84_I84:
+ return;
+ case ARMri84_R:
+ ri84->ARMri84.R.reg = lookupHRegRemap(m, ri84->ARMri84.R.reg);
+ return;
+ default:
+ vpanic("mapRegs_ARMRI84");
+ }
+}
+
+
+/* --------- Reg or imm5 operands --------- */
+
+ARMRI5* ARMRI5_I5 ( UInt imm5 ) {
+ ARMRI5* ri5 = LibVEX_Alloc(sizeof(ARMRI5));
+ ri5->tag = ARMri5_I5;
+ ri5->ARMri5.I5.imm5 = imm5;
+ vassert(imm5 > 0 && imm5 <= 31); // zero is not allowed
+ return ri5;
+}
+ARMRI5* ARMRI5_R ( HReg reg ) {
+ ARMRI5* ri5 = LibVEX_Alloc(sizeof(ARMRI5));
+ ri5->tag = ARMri5_R;
+ ri5->ARMri5.R.reg = reg;
+ return ri5;
+}
+
+void ppARMRI5 ( ARMRI5* ri5 ) {
+ switch (ri5->tag) {
+ case ARMri5_I5:
+ vex_printf("%u", ri5->ARMri5.I5.imm5);
+ break;
+ case ARMri5_R:
+ ppHRegARM(ri5->ARMri5.R.reg);
+ break;
+ default:
+ vassert(0);
+ }
+}
+
+static void addRegUsage_ARMRI5 ( HRegUsage* u, ARMRI5* ri5 ) {
+ switch (ri5->tag) {
+ case ARMri5_I5:
+ return;
+ case ARMri5_R:
+ addHRegUse(u, HRmRead, ri5->ARMri5.R.reg);
+ return;
+ default:
+ vpanic("addRegUsage_ARMRI5");
+ }
+}
+
+static void mapRegs_ARMRI5 ( HRegRemap* m, ARMRI5* ri5 ) {
+ switch (ri5->tag) {
+ case ARMri5_I5:
+ return;
+ case ARMri5_R:
+ ri5->ARMri5.R.reg = lookupHRegRemap(m, ri5->ARMri5.R.reg);
+ return;
+ default:
+ vpanic("mapRegs_ARMRI5");
+ }
+}
+
+/* -------- Neon Immediate operand --------- */
+
+ARMNImm* ARMNImm_TI ( UInt type, UInt imm8 ) {
+ ARMNImm* i = LibVEX_Alloc(sizeof(ARMNImm));
+ i->type = type;
+ i->imm8 = imm8;
+ return i;
+}
+
+ULong ARMNImm_to_Imm64 ( ARMNImm* imm ) {
+ int i, j;
+ ULong y, x = imm->imm8;
+ switch (imm->type) {
+      case 3:
+         x = x << 8;
+         /* fall through */
+      case 2:
+         x = x << 8;
+         /* fall through */
+      case 1:
+         x = x << 8;
+         /* fall through */
+      case 0:
+         return (x << 32) | x;
+      case 5:
+      case 6:
+         if (imm->type == 5)
+            x = x << 8;
+         else
+            x = (x << 8) | x;
+         /* fall through */
+      case 4:
+         x = (x << 16) | x;
+         return (x << 32) | x;
+      case 8:
+         x = (x << 8) | 0xFF;
+         /* fall through */
+      case 7:
+         x = (x << 8) | 0xFF;
+         return (x << 32) | x;
+ case 9:
+ x = 0;
+ for (i = 7; i >= 0; i--) {
+ y = ((ULong)imm->imm8 >> i) & 1;
+ for (j = 0; j < 8; j++) {
+ x = (x << 1) | y;
+ }
+ }
+ return x;
+ case 10:
+ x |= (x & 0x80) << 5;
+ x |= ~(x & 0x40) << 5;
+ x &= 0x187F; /* 0001 1000 0111 1111 */
+ x |= (x & 0x40) << 4;
+ x |= (x & 0x40) << 3;
+ x |= (x & 0x40) << 2;
+ x |= (x & 0x40) << 1;
+ x = x << 19;
+ x = (x << 32) | x;
+ return x;
+ default:
+ vpanic("ARMNImm_to_Imm64");
+ }
+}
+
+ARMNImm* Imm64_to_ARMNImm ( ULong x ) {
+ ARMNImm tmp;
+ if ((x & 0xFFFFFFFF) == (x >> 32)) {
+ if ((x & 0xFFFFFF00) == 0)
+ return ARMNImm_TI(0, x & 0xFF);
+ if ((x & 0xFFFF00FF) == 0)
+ return ARMNImm_TI(1, (x >> 8) & 0xFF);
+ if ((x & 0xFF00FFFF) == 0)
+ return ARMNImm_TI(2, (x >> 16) & 0xFF);
+ if ((x & 0x00FFFFFF) == 0)
+ return ARMNImm_TI(3, (x >> 24) & 0xFF);
+ if ((x & 0xFFFF00FF) == 0xFF)
+ return ARMNImm_TI(7, (x >> 8) & 0xFF);
+ if ((x & 0xFF00FFFF) == 0xFFFF)
+ return ARMNImm_TI(8, (x >> 16) & 0xFF);
+ if ((x & 0xFFFF) == ((x >> 16) & 0xFFFF)) {
+ if ((x & 0xFF00) == 0)
+ return ARMNImm_TI(4, x & 0xFF);
+ if ((x & 0x00FF) == 0)
+ return ARMNImm_TI(5, (x >> 8) & 0xFF);
+ if ((x & 0xFF) == ((x >> 8) & 0xFF))
+ return ARMNImm_TI(6, x & 0xFF);
+ }
+ if ((x & 0x7FFFF) == 0) {
+ tmp.type = 10;
+ tmp.imm8 = ((x >> 19) & 0x7F) | ((x >> 24) & 0x80);
+ if (ARMNImm_to_Imm64(&tmp) == x)
+ return ARMNImm_TI(tmp.type, tmp.imm8);
+ }
+ } else {
+ /* This can only be type 9. */
+ tmp.imm8 = (((x >> 56) & 1) << 7)
+ | (((x >> 48) & 1) << 6)
+ | (((x >> 40) & 1) << 5)
+ | (((x >> 32) & 1) << 4)
+ | (((x >> 24) & 1) << 3)
+ | (((x >> 16) & 1) << 2)
+ | (((x >> 8) & 1) << 1)
+ | (((x >> 0) & 1) << 0);
+ tmp.type = 9;
+ if (ARMNImm_to_Imm64 (&tmp) == x)
+ return ARMNImm_TI(tmp.type, tmp.imm8);
+ }
+ return NULL;
+}
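+
+/* Round-trip sketch: Imm64_to_ARMNImm(0x00AB00AB00AB00AB) yields a
+ type 4 immediate with imm8 = 0xAB, and feeding that back through
+ ARMNImm_to_Imm64 reproduces the original value. Values with no
+ valid encoding come back as NULL, which callers must handle. */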
+
+void ppARMNImm (ARMNImm* i) {
+ ULong x = ARMNImm_to_Imm64(i);
+ /* The 64-bit pattern is printed twice, since the immediate is
+ replicated across both halves of a 128-bit Q register. */
+ vex_printf("0x%llX%llX", x, x);
+}
+
+/* --------- Register or scalar operand --------- */
+
+ARMNRS* mkARMNRS(ARMNRS_tag tag, HReg reg, UInt index)
+{
+ ARMNRS *p = LibVEX_Alloc(sizeof(ARMNRS));
+ p->tag = tag;
+ p->reg = reg;
+ p->index = index;
+ return p;
+}
+
+void ppARMNRS(ARMNRS *p)
+{
+ ppHRegARM(p->reg);
+ if (p->tag == ARMNRS_Scalar) {
+ vex_printf("[%d]", p->index);
+ }
+}
+
+/* --------- Instructions. --------- */
+
+HChar* showARMAluOp ( ARMAluOp op ) {
+ switch (op) {
+ case ARMalu_ADD: return "add";
+ case ARMalu_ADDS: return "adds";
+ case ARMalu_ADC: return "adc";
+ case ARMalu_SUB: return "sub";
+ case ARMalu_SUBS: return "subs";
+ case ARMalu_SBC: return "sbc";
+ case ARMalu_AND: return "and";
+ case ARMalu_BIC: return "bic";
+ case ARMalu_OR: return "orr";
+ case ARMalu_XOR: return "xor";
+ default: vpanic("showARMAluOp");
+ }
+}
+
+HChar* showARMShiftOp ( ARMShiftOp op ) {
+ switch (op) {
+ case ARMsh_SHL: return "shl";
+ case ARMsh_SHR: return "shr";
+ case ARMsh_SAR: return "sar";
+ default: vpanic("showARMShiftOp");
+ }
+}
+
+HChar* showARMUnaryOp ( ARMUnaryOp op ) {
+ switch (op) {
+ case ARMun_NEG: return "neg";
+ case ARMun_NOT: return "not";
+ case ARMun_CLZ: return "clz";
+ default: vpanic("showARMUnaryOp");
+ }
+}
+
+HChar* showARMMulOp ( ARMMulOp op ) {
+ switch (op) {
+ case ARMmul_PLAIN: return "mul";
+ case ARMmul_ZX: return "umull";
+ case ARMmul_SX: return "smull";
+ default: vpanic("showARMMulOp");
+ }
+}
+
+HChar* showARMVfpOp ( ARMVfpOp op ) {
+ switch (op) {
+ case ARMvfp_ADD: return "add";
+ case ARMvfp_SUB: return "sub";
+ case ARMvfp_MUL: return "mul";
+ case ARMvfp_DIV: return "div";
+ default: vpanic("showARMVfpOp");
+ }
+}
+
+HChar* showARMVfpUnaryOp ( ARMVfpUnaryOp op ) {
+ switch (op) {
+ case ARMvfpu_COPY: return "cpy";
+ case ARMvfpu_NEG: return "neg";
+ case ARMvfpu_ABS: return "abs";
+ case ARMvfpu_SQRT: return "sqrt";
+ default: vpanic("showARMVfpUnaryOp");
+ }
+}
+
+HChar* showARMNeonBinOp ( ARMNeonBinOp op ) {
+ switch (op) {
+ case ARMneon_VAND: return "vand";
+ case ARMneon_VORR: return "vorr";
+ case ARMneon_VXOR: return "veor";
+ case ARMneon_VADD: return "vadd";
+ case ARMneon_VRHADDS: return "vrhadd";
+ case ARMneon_VRHADDU: return "vrhadd";
+ case ARMneon_VADDFP: return "vadd";
+ case ARMneon_VPADDFP: return "vpadd";
+ case ARMneon_VABDFP: return "vabd";
+ case ARMneon_VSUB: return "vsub";
+ case ARMneon_VSUBFP: return "vsub";
+ case ARMneon_VMINU: return "vmin";
+ case ARMneon_VMINS: return "vmin";
+ case ARMneon_VMINF: return "vmin";
+ case ARMneon_VMAXU: return "vmax";
+ case ARMneon_VMAXS: return "vmax";
+ case ARMneon_VMAXF: return "vmax";
+ case ARMneon_VQADDU: return "vqadd";
+ case ARMneon_VQADDS: return "vqadd";
+ case ARMneon_VQSUBU: return "vqsub";
+ case ARMneon_VQSUBS: return "vqsub";
+ case ARMneon_VCGTU: return "vcgt";
+ case ARMneon_VCGTS: return "vcgt";
+ case ARMneon_VCGTF: return "vcgt";
+ case ARMneon_VCGEF: return "vcge";
+ case ARMneon_VCGEU: return "vcge";
+ case ARMneon_VCGES: return "vcge";
+ case ARMneon_VCEQ: return "vceq";
+ case ARMneon_VCEQF: return "vceq";
+ case ARMneon_VPADD: return "vpadd";
+ case ARMneon_VPMINU: return "vpmin";
+ case ARMneon_VPMINS: return "vpmin";
+ case ARMneon_VPMINF: return "vpmin";
+ case ARMneon_VPMAXU: return "vpmax";
+ case ARMneon_VPMAXS: return "vpmax";
+ case ARMneon_VPMAXF: return "vpmax";
+ case ARMneon_VEXT: return "vext";
+ case ARMneon_VMUL: return "vmul";
+ case ARMneon_VMULLU: return "vmull";
+ case ARMneon_VMULLS: return "vmull";
+ case ARMneon_VMULP: return "vmul";
+ case ARMneon_VMULFP: return "vmul";
+ case ARMneon_VMULLP: return "vmull";
+ case ARMneon_VQDMULH: return "vqdmulh";
+ case ARMneon_VQRDMULH: return "vqrdmulh";
+ case ARMneon_VQDMULL: return "vqdmull";
+ case ARMneon_VTBL: return "vtbl";
+ case ARMneon_VRECPS: return "vrecps";
+ case ARMneon_VRSQRTS: return "vrsqrts";
+ /* ... */
+ default: vpanic("showARMNeonBinOp");
+ }
+}
+
+HChar* showARMNeonBinOpDataType ( ARMNeonBinOp op ) {
+ switch (op) {
+ case ARMneon_VAND:
+ case ARMneon_VORR:
+ case ARMneon_VXOR:
+ return "";
+ case ARMneon_VADD:
+ case ARMneon_VSUB:
+ case ARMneon_VEXT:
+ case ARMneon_VMUL:
+ case ARMneon_VPADD:
+ case ARMneon_VTBL:
+ case ARMneon_VCEQ:
+ return ".i";
+ case ARMneon_VRHADDU:
+ case ARMneon_VMINU:
+ case ARMneon_VMAXU:
+ case ARMneon_VQADDU:
+ case ARMneon_VQSUBU:
+ case ARMneon_VCGTU:
+ case ARMneon_VCGEU:
+ case ARMneon_VMULLU:
+ case ARMneon_VPMINU:
+ case ARMneon_VPMAXU:
+ return ".u";
+ case ARMneon_VRHADDS:
+ case ARMneon_VMINS:
+ case ARMneon_VMAXS:
+ case ARMneon_VQADDS:
+ case ARMneon_VQSUBS:
+ case ARMneon_VCGTS:
+ case ARMneon_VCGES:
+ case ARMneon_VQDMULL:
+ case ARMneon_VMULLS:
+ case ARMneon_VPMINS:
+ case ARMneon_VPMAXS:
+ case ARMneon_VQDMULH:
+ case ARMneon_VQRDMULH:
+ return ".s";
+ case ARMneon_VMULP:
+ case ARMneon_VMULLP:
+ return ".p";
+ case ARMneon_VADDFP:
+ case ARMneon_VABDFP:
+ case ARMneon_VPADDFP:
+ case ARMneon_VSUBFP:
+ case ARMneon_VMULFP:
+ case ARMneon_VMINF:
+ case ARMneon_VMAXF:
+ case ARMneon_VPMINF:
+ case ARMneon_VPMAXF:
+ case ARMneon_VCGTF:
+ case ARMneon_VCGEF:
+ case ARMneon_VCEQF:
+ case ARMneon_VRECPS:
+ case ARMneon_VRSQRTS:
+ return ".f";
+ /* ... */
+ default: vpanic("showARMNeonBinOpDataType");
+ }
+}
+
+HChar* showARMNeonUnOp ( ARMNeonUnOp op ) {
+ switch (op) {
+ case ARMneon_COPY: return "vmov";
+ case ARMneon_COPYLS: return "vmov";
+ case ARMneon_COPYLU: return "vmov";
+ case ARMneon_COPYN: return "vmov";
+ case ARMneon_COPYQNSS: return "vqmovn";
+ case ARMneon_COPYQNUS: return "vqmovun";
+ case ARMneon_COPYQNUU: return "vqmovn";
+ case ARMneon_NOT: return "vmvn";
+ case ARMneon_EQZ: return "vceq";
+ case ARMneon_CNT: return "vcnt";
+ case ARMneon_CLS: return "vcls";
+ case ARMneon_CLZ: return "vclz";
+ case ARMneon_DUP: return "vdup";
+ case ARMneon_PADDLS: return "vpaddl";
+ case ARMneon_PADDLU: return "vpaddl";
+ case ARMneon_VQSHLNSS: return "vqshl";
+ case ARMneon_VQSHLNUU: return "vqshl";
+ case ARMneon_VQSHLNUS: return "vqshlu";
+ case ARMneon_REV16: return "vrev16";
+ case ARMneon_REV32: return "vrev32";
+ case ARMneon_REV64: return "vrev64";
+ case ARMneon_VCVTFtoU: return "vcvt";
+ case ARMneon_VCVTFtoS: return "vcvt";
+ case ARMneon_VCVTUtoF: return "vcvt";
+ case ARMneon_VCVTStoF: return "vcvt";
+ case ARMneon_VCVTFtoFixedU: return "vcvt";
+ case ARMneon_VCVTFtoFixedS: return "vcvt";
+ case ARMneon_VCVTFixedUtoF: return "vcvt";
+ case ARMneon_VCVTFixedStoF: return "vcvt";
+ case ARMneon_VCVTF32toF16: return "vcvt";
+ case ARMneon_VCVTF16toF32: return "vcvt";
+ case ARMneon_VRECIP: return "vrecip";
+ case ARMneon_VRECIPF: return "vrecipf";
+ case ARMneon_VNEGF: return "vneg";
+ case ARMneon_ABS: return "vabs";
+ case ARMneon_VABSFP: return "vabsfp";
+ case ARMneon_VRSQRTEFP: return "vrsqrtefp";
+ case ARMneon_VRSQRTE: return "vrsqrte";
+ /* ... */
+ default: vpanic("showARMNeonUnOp");
+ }
+}
+
+HChar* showARMNeonUnOpDataType ( ARMNeonUnOp op ) {
+ switch (op) {
+ case ARMneon_COPY:
+ case ARMneon_NOT:
+ return "";
+ case ARMneon_COPYN:
+ case ARMneon_EQZ:
+ case ARMneon_CNT:
+ case ARMneon_DUP:
+ case ARMneon_REV16:
+ case ARMneon_REV32:
+ case ARMneon_REV64:
+ return ".i";
+ case ARMneon_COPYLU:
+ case ARMneon_PADDLU:
+ case ARMneon_COPYQNUU:
+ case ARMneon_VQSHLNUU:
+ case ARMneon_VRECIP:
+ case ARMneon_VRSQRTE:
+ return ".u";
+ case ARMneon_CLS:
+ case ARMneon_CLZ:
+ case ARMneon_COPYLS:
+ case ARMneon_PADDLS:
+ case ARMneon_COPYQNSS:
+ case ARMneon_COPYQNUS:
+ case ARMneon_VQSHLNSS:
+ case ARMneon_VQSHLNUS:
+ case ARMneon_ABS:
+ return ".s";
+ case ARMneon_VRECIPF:
+ case ARMneon_VNEGF:
+ case ARMneon_VABSFP:
+ case ARMneon_VRSQRTEFP:
+ return ".f";
+ case ARMneon_VCVTFtoU: return ".u32.f32";
+ case ARMneon_VCVTFtoS: return ".s32.f32";
+ case ARMneon_VCVTUtoF: return ".f32.u32";
+ case ARMneon_VCVTStoF: return ".f32.s32";
+ case ARMneon_VCVTF16toF32: return ".f32.f16";
+ case ARMneon_VCVTF32toF16: return ".f16.f32";
+ case ARMneon_VCVTFtoFixedU: return ".u32.f32";
+ case ARMneon_VCVTFtoFixedS: return ".s32.f32";
+ case ARMneon_VCVTFixedUtoF: return ".f32.u32";
+ case ARMneon_VCVTFixedStoF: return ".f32.s32";
+ /* ... */
+ default: vpanic("showARMNeonUnOpDataType");
+ }
+}
+
+HChar* showARMNeonUnOpS ( ARMNeonUnOpS op ) {
+ switch (op) {
+ case ARMneon_SETELEM: return "vmov";
+ case ARMneon_GETELEMU: return "vmov";
+ case ARMneon_GETELEMS: return "vmov";
+ case ARMneon_VDUP: return "vdup";
+ /* ... */
+ default: vpanic("showARMNeonUnarySOp");
+ }
+}
+
+HChar* showARMNeonUnOpSDataType ( ARMNeonUnOpS op ) {
+ switch (op) {
+ case ARMneon_SETELEM:
+ case ARMneon_VDUP:
+ return ".i";
+ case ARMneon_GETELEMS:
+ return ".s";
+ case ARMneon_GETELEMU:
+ return ".u";
+ /* ... */
+ default: vpanic("showARMNeonUnarySOp");
+ }
+}
+
+HChar* showARMNeonShiftOp ( ARMNeonShiftOp op ) {
+ switch (op) {
+ case ARMneon_VSHL: return "vshl";
+ case ARMneon_VSAL: return "vshl";
+ case ARMneon_VQSHL: return "vqshl";
+ case ARMneon_VQSAL: return "vqshl";
+ /* ... */
+ default: vpanic("showARMNeonShiftOp");
+ }
+}
+
+HChar* showARMNeonShiftOpDataType ( ARMNeonShiftOp op ) {
+ switch (op) {
+ case ARMneon_VSHL:
+ case ARMneon_VQSHL:
+ return ".u";
+ case ARMneon_VSAL:
+ case ARMneon_VQSAL:
+ return ".s";
+ /* ... */
+ default: vpanic("showARMNeonShiftOpDataType");
+ }
+}
+
+HChar* showARMNeonDualOp ( ARMNeonDualOp op ) {
+ switch (op) {
+ case ARMneon_TRN: return "vtrn";
+ case ARMneon_ZIP: return "vzip";
+ case ARMneon_UZP: return "vuzp";
+ /* ... */
+ default: vpanic("showARMNeonDualOp");
+ }
+}
+
+HChar* showARMNeonDualOpDataType ( ARMNeonDualOp op ) {
+ switch (op) {
+ case ARMneon_TRN:
+ case ARMneon_ZIP:
+ case ARMneon_UZP:
+ return "i";
+ /* ... */
+ default: vpanic("showARMNeonDualOp");
+ }
+}
+
+static HChar* showARMNeonDataSize_wrk ( UInt size )
+{
+ switch (size) {
+ case 0: return "8";
+ case 1: return "16";
+ case 2: return "32";
+ case 3: return "64";
+ default: vpanic("showARMNeonDataSize");
+ }
+}
+
+static HChar* showARMNeonDataSize ( ARMInstr* i )
+{
+ switch (i->tag) {
+ case ARMin_NBinary:
+ if (i->ARMin.NBinary.op == ARMneon_VEXT)
+ return "8";
+ if (i->ARMin.NBinary.op == ARMneon_VAND ||
+ i->ARMin.NBinary.op == ARMneon_VORR ||
+ i->ARMin.NBinary.op == ARMneon_VXOR)
+ return "";
+ return showARMNeonDataSize_wrk(i->ARMin.NBinary.size);
+ case ARMin_NUnary:
+ if (i->ARMin.NUnary.op == ARMneon_COPY ||
+ i->ARMin.NUnary.op == ARMneon_NOT ||
+ i->ARMin.NUnary.op == ARMneon_VCVTF32toF16||
+ i->ARMin.NUnary.op == ARMneon_VCVTF16toF32||
+ i->ARMin.NUnary.op == ARMneon_VCVTFtoFixedS ||
+ i->ARMin.NUnary.op == ARMneon_VCVTFtoFixedU ||
+ i->ARMin.NUnary.op == ARMneon_VCVTFixedStoF ||
+ i->ARMin.NUnary.op == ARMneon_VCVTFixedUtoF ||
+ i->ARMin.NUnary.op == ARMneon_VCVTFtoS ||
+ i->ARMin.NUnary.op == ARMneon_VCVTFtoU ||
+ i->ARMin.NUnary.op == ARMneon_VCVTStoF ||
+ i->ARMin.NUnary.op == ARMneon_VCVTUtoF)
+ return "";
+ if (i->ARMin.NUnary.op == ARMneon_VQSHLNSS ||
+ i->ARMin.NUnary.op == ARMneon_VQSHLNUU ||
+ i->ARMin.NUnary.op == ARMneon_VQSHLNUS) {
+ UInt size;
+ size = i->ARMin.NUnary.size;
+ if (size & 0x40)
+ return "64";
+ if (size & 0x20)
+ return "32";
+ if (size & 0x10)
+ return "16";
+ if (size & 0x08)
+ return "8";
+ vpanic("showARMNeonDataSize");
+ }
+ return showARMNeonDataSize_wrk(i->ARMin.NUnary.size);
+ case ARMin_NUnaryS:
+ if (i->ARMin.NUnaryS.op == ARMneon_VDUP) {
+ UInt size;
+ size = i->ARMin.NUnaryS.size;
+ if ((size & 1) == 1)
+ return "8";
+ if ((size & 3) == 2)
+ return "16";
+ if ((size & 7) == 4)
+ return "32";
+ vpanic("showARMNeonDataSize");
+ }
+ return showARMNeonDataSize_wrk(i->ARMin.NUnaryS.size);
+ case ARMin_NShift:
+ return showARMNeonDataSize_wrk(i->ARMin.NShift.size);
+ case ARMin_NDual:
+ return showARMNeonDataSize_wrk(i->ARMin.NDual.size);
+ default:
+ vpanic("showARMNeonDataSize");
+ }
+}
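+
+/* Note: for the VQSHLN ops the 'size' field packs both the lane width
+ and the shift amount -- the highest set bit among 0x40/0x20/0x10/0x08
+ selects 64/32/16/8-bit lanes, and the bits below it give the shift
+ count. ppARMInstr below decodes the "#imm" operand the same way. */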
+
+ARMInstr* ARMInstr_Alu ( ARMAluOp op,
+ HReg dst, HReg argL, ARMRI84* argR ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_Alu;
+ i->ARMin.Alu.op = op;
+ i->ARMin.Alu.dst = dst;
+ i->ARMin.Alu.argL = argL;
+ i->ARMin.Alu.argR = argR;
+ return i;
+}
+ARMInstr* ARMInstr_Shift ( ARMShiftOp op,
+ HReg dst, HReg argL, ARMRI5* argR ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_Shift;
+ i->ARMin.Shift.op = op;
+ i->ARMin.Shift.dst = dst;
+ i->ARMin.Shift.argL = argL;
+ i->ARMin.Shift.argR = argR;
+ return i;
+}
+ARMInstr* ARMInstr_Unary ( ARMUnaryOp op, HReg dst, HReg src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_Unary;
+ i->ARMin.Unary.op = op;
+ i->ARMin.Unary.dst = dst;
+ i->ARMin.Unary.src = src;
+ return i;
+}
+ARMInstr* ARMInstr_CmpOrTst ( Bool isCmp, HReg argL, ARMRI84* argR ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_CmpOrTst;
+ i->ARMin.CmpOrTst.isCmp = isCmp;
+ i->ARMin.CmpOrTst.argL = argL;
+ i->ARMin.CmpOrTst.argR = argR;
+ return i;
+}
+ARMInstr* ARMInstr_Mov ( HReg dst, ARMRI84* src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_Mov;
+ i->ARMin.Mov.dst = dst;
+ i->ARMin.Mov.src = src;
+ return i;
+}
+ARMInstr* ARMInstr_Imm32 ( HReg dst, UInt imm32 ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_Imm32;
+ i->ARMin.Imm32.dst = dst;
+ i->ARMin.Imm32.imm32 = imm32;
+ return i;
+}
+ARMInstr* ARMInstr_LdSt32 ( Bool isLoad, HReg rD, ARMAMode1* amode ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_LdSt32;
+ i->ARMin.LdSt32.isLoad = isLoad;
+ i->ARMin.LdSt32.rD = rD;
+ i->ARMin.LdSt32.amode = amode;
+ return i;
+}
+ARMInstr* ARMInstr_LdSt16 ( Bool isLoad, Bool signedLoad,
+ HReg rD, ARMAMode2* amode ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_LdSt16;
+ i->ARMin.LdSt16.isLoad = isLoad;
+ i->ARMin.LdSt16.signedLoad = signedLoad;
+ i->ARMin.LdSt16.rD = rD;
+ i->ARMin.LdSt16.amode = amode;
+ return i;
+}
+ARMInstr* ARMInstr_LdSt8U ( Bool isLoad, HReg rD, ARMAMode1* amode ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_LdSt8U;
+ i->ARMin.LdSt8U.isLoad = isLoad;
+ i->ARMin.LdSt8U.rD = rD;
+ i->ARMin.LdSt8U.amode = amode;
+ return i;
+}
+//extern ARMInstr* ARMInstr_Ld8S ( HReg, ARMAMode2* );
+ARMInstr* ARMInstr_Goto ( IRJumpKind jk, ARMCondCode cond, HReg gnext ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_Goto;
+ i->ARMin.Goto.jk = jk;
+ i->ARMin.Goto.cond = cond;
+ i->ARMin.Goto.gnext = gnext;
+ return i;
+}
+ARMInstr* ARMInstr_CMov ( ARMCondCode cond, HReg dst, ARMRI84* src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_CMov;
+ i->ARMin.CMov.cond = cond;
+ i->ARMin.CMov.dst = dst;
+ i->ARMin.CMov.src = src;
+ vassert(cond != ARMcc_AL);
+ return i;
+}
+ARMInstr* ARMInstr_Call ( ARMCondCode cond, HWord target, Int nArgRegs ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_Call;
+ i->ARMin.Call.cond = cond;
+ i->ARMin.Call.target = target;
+ i->ARMin.Call.nArgRegs = nArgRegs;
+ return i;
+}
+ARMInstr* ARMInstr_Mul ( ARMMulOp op ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_Mul;
+ i->ARMin.Mul.op = op;
+ return i;
+}
+ARMInstr* ARMInstr_LdrEX ( Int szB ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_LdrEX;
+ i->ARMin.LdrEX.szB = szB;
+ vassert(szB == 4 || szB == 1);
+ return i;
+}
+ARMInstr* ARMInstr_StrEX ( Int szB ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_StrEX;
+ i->ARMin.StrEX.szB = szB;
+ vassert(szB == 4 || szB == 1);
+ return i;
+}
+ARMInstr* ARMInstr_VLdStD ( Bool isLoad, HReg dD, ARMAModeV* am ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VLdStD;
+ i->ARMin.VLdStD.isLoad = isLoad;
+ i->ARMin.VLdStD.dD = dD;
+ i->ARMin.VLdStD.amode = am;
+ return i;
+}
+ARMInstr* ARMInstr_VLdStS ( Bool isLoad, HReg fD, ARMAModeV* am ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VLdStS;
+ i->ARMin.VLdStS.isLoad = isLoad;
+ i->ARMin.VLdStS.fD = fD;
+ i->ARMin.VLdStS.amode = am;
+ return i;
+}
+ARMInstr* ARMInstr_VAluD ( ARMVfpOp op, HReg dst, HReg argL, HReg argR ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VAluD;
+ i->ARMin.VAluD.op = op;
+ i->ARMin.VAluD.dst = dst;
+ i->ARMin.VAluD.argL = argL;
+ i->ARMin.VAluD.argR = argR;
+ return i;
+}
+ARMInstr* ARMInstr_VAluS ( ARMVfpOp op, HReg dst, HReg argL, HReg argR ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VAluS;
+ i->ARMin.VAluS.op = op;
+ i->ARMin.VAluS.dst = dst;
+ i->ARMin.VAluS.argL = argL;
+ i->ARMin.VAluS.argR = argR;
+ return i;
+}
+ARMInstr* ARMInstr_VUnaryD ( ARMVfpUnaryOp op, HReg dst, HReg src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VUnaryD;
+ i->ARMin.VUnaryD.op = op;
+ i->ARMin.VUnaryD.dst = dst;
+ i->ARMin.VUnaryD.src = src;
+ return i;
+}
+ARMInstr* ARMInstr_VUnaryS ( ARMVfpUnaryOp op, HReg dst, HReg src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VUnaryS;
+ i->ARMin.VUnaryS.op = op;
+ i->ARMin.VUnaryS.dst = dst;
+ i->ARMin.VUnaryS.src = src;
+ return i;
+}
+ARMInstr* ARMInstr_VCmpD ( HReg argL, HReg argR ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VCmpD;
+ i->ARMin.VCmpD.argL = argL;
+ i->ARMin.VCmpD.argR = argR;
+ return i;
+}
+ARMInstr* ARMInstr_VCMovD ( ARMCondCode cond, HReg dst, HReg src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VCMovD;
+ i->ARMin.VCMovD.cond = cond;
+ i->ARMin.VCMovD.dst = dst;
+ i->ARMin.VCMovD.src = src;
+ vassert(cond != ARMcc_AL);
+ return i;
+}
+ARMInstr* ARMInstr_VCMovS ( ARMCondCode cond, HReg dst, HReg src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VCMovS;
+ i->ARMin.VCMovS.cond = cond;
+ i->ARMin.VCMovS.dst = dst;
+ i->ARMin.VCMovS.src = src;
+ vassert(cond != ARMcc_AL);
+ return i;
+}
+ARMInstr* ARMInstr_VCvtSD ( Bool sToD, HReg dst, HReg src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VCvtSD;
+ i->ARMin.VCvtSD.sToD = sToD;
+ i->ARMin.VCvtSD.dst = dst;
+ i->ARMin.VCvtSD.src = src;
+ return i;
+}
+ARMInstr* ARMInstr_VXferD ( Bool toD, HReg dD, HReg rHi, HReg rLo ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VXferD;
+ i->ARMin.VXferD.toD = toD;
+ i->ARMin.VXferD.dD = dD;
+ i->ARMin.VXferD.rHi = rHi;
+ i->ARMin.VXferD.rLo = rLo;
+ return i;
+}
+ARMInstr* ARMInstr_VXferS ( Bool toS, HReg fD, HReg rLo ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VXferS;
+ i->ARMin.VXferS.toS = toS;
+ i->ARMin.VXferS.fD = fD;
+ i->ARMin.VXferS.rLo = rLo;
+ return i;
+}
+ARMInstr* ARMInstr_VCvtID ( Bool iToD, Bool syned,
+ HReg dst, HReg src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_VCvtID;
+ i->ARMin.VCvtID.iToD = iToD;
+ i->ARMin.VCvtID.syned = syned;
+ i->ARMin.VCvtID.dst = dst;
+ i->ARMin.VCvtID.src = src;
+ return i;
+}
+ARMInstr* ARMInstr_FPSCR ( Bool toFPSCR, HReg iReg ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_FPSCR;
+ i->ARMin.FPSCR.toFPSCR = toFPSCR;
+ i->ARMin.FPSCR.iReg = iReg;
+ return i;
+}
+ARMInstr* ARMInstr_MFence ( void ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_MFence;
+ return i;
+}
+
+ARMInstr* ARMInstr_NLdStQ ( Bool isLoad, HReg dQ, ARMAModeN *amode ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_NLdStQ;
+ i->ARMin.NLdStQ.isLoad = isLoad;
+ i->ARMin.NLdStQ.dQ = dQ;
+ i->ARMin.NLdStQ.amode = amode;
+ return i;
+}
+
+ARMInstr* ARMInstr_NLdStD ( Bool isLoad, HReg dD, ARMAModeN *amode ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_NLdStD;
+ i->ARMin.NLdStD.isLoad = isLoad;
+ i->ARMin.NLdStD.dD = dD;
+ i->ARMin.NLdStD.amode = amode;
+ return i;
+}
+
+ARMInstr* ARMInstr_NUnary ( ARMNeonUnOp op, HReg dQ, HReg nQ,
+ UInt size, Bool Q ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_NUnary;
+ i->ARMin.NUnary.op = op;
+ i->ARMin.NUnary.src = nQ;
+ i->ARMin.NUnary.dst = dQ;
+ i->ARMin.NUnary.size = size;
+ i->ARMin.NUnary.Q = Q;
+ return i;
+}
+
+ARMInstr* ARMInstr_NUnaryS ( ARMNeonUnOp op, ARMNRS* dst, ARMNRS* src,
+ UInt size, Bool Q ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_NUnaryS;
+ i->ARMin.NUnaryS.op = op;
+ i->ARMin.NUnaryS.src = src;
+ i->ARMin.NUnaryS.dst = dst;
+ i->ARMin.NUnaryS.size = size;
+ i->ARMin.NUnaryS.Q = Q;
+ return i;
+}
+
+ARMInstr* ARMInstr_NDual ( ARMNeonDualOp op, HReg nQ, HReg mQ,
+ UInt size, Bool Q ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_NDual;
+ i->ARMin.NDual.op = op;
+ i->ARMin.NDual.arg1 = nQ;
+ i->ARMin.NDual.arg2 = mQ;
+ i->ARMin.NDual.size = size;
+ i->ARMin.NDual.Q = Q;
+ return i;
+}
+
+ARMInstr* ARMInstr_NBinary ( ARMNeonBinOp op,
+ HReg dst, HReg argL, HReg argR,
+ UInt size, Bool Q ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_NBinary;
+ i->ARMin.NBinary.op = op;
+ i->ARMin.NBinary.argL = argL;
+ i->ARMin.NBinary.argR = argR;
+ i->ARMin.NBinary.dst = dst;
+ i->ARMin.NBinary.size = size;
+ i->ARMin.NBinary.Q = Q;
+ return i;
+}
+
+ARMInstr* ARMInstr_NeonImm (HReg dst, ARMNImm* imm ) {
+ ARMInstr *i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_NeonImm;
+ i->ARMin.NeonImm.dst = dst;
+ i->ARMin.NeonImm.imm = imm;
+ return i;
+}
+
+ARMInstr* ARMInstr_NCMovQ ( ARMCondCode cond, HReg dst, HReg src ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_NCMovQ;
+ i->ARMin.NCMovQ.cond = cond;
+ i->ARMin.NCMovQ.dst = dst;
+ i->ARMin.NCMovQ.src = src;
+ vassert(cond != ARMcc_AL);
+ return i;
+}
+
+ARMInstr* ARMInstr_NShift ( ARMNeonShiftOp op,
+ HReg dst, HReg argL, HReg argR,
+ UInt size, Bool Q ) {
+ ARMInstr* i = LibVEX_Alloc(sizeof(ARMInstr));
+ i->tag = ARMin_NShift;
+ i->ARMin.NShift.op = op;
+ i->ARMin.NShift.argL = argL;
+ i->ARMin.NShift.argR = argR;
+ i->ARMin.NShift.dst = dst;
+ i->ARMin.NShift.size = size;
+ i->ARMin.NShift.Q = Q;
+ return i;
+}
+
+/* Helper copy-pasted from isel.c */
+static Bool fitsIn8x4 ( UInt* u8, UInt* u4, UInt u )
+{
+ UInt i;
+ for (i = 0; i < 16; i++) {
+ if (0 == (u & 0xFFFFFF00)) {
+ *u8 = u;
+ *u4 = i;
+ return True;
+ }
+ u = ROR32(u, 30);
+ }
+ vassert(i == 16);
+ return False;
+}
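+
+/* For example (illustrative): 0xFF000000 is accepted with u8 = 0xFF,
+ u4 = 4, since 0xFF rotated right by 2*4 = 8 bits gives 0xFF000000.
+ 0x101 is rejected: its set bits span nine positions, so no rotation
+ can fit them into the low 8 bits. */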
+
+ARMInstr* ARMInstr_Add32 ( HReg rD, HReg rN, UInt imm32 ) {
+ UInt u8, u4;
+ ARMInstr *i = LibVEX_Alloc(sizeof(ARMInstr));
+ /* Try to generate single ADD if possible */
+ if (fitsIn8x4(&u8, &u4, imm32)) {
+ i->tag = ARMin_Alu;
+ i->ARMin.Alu.op = ARMalu_ADD;
+ i->ARMin.Alu.dst = rD;
+ i->ARMin.Alu.argL = rN;
+ i->ARMin.Alu.argR = ARMRI84_I84(u8, u4);
+ } else {
+ i->tag = ARMin_Add32;
+ i->ARMin.Add32.rD = rD;
+ i->ARMin.Add32.rN = rN;
+ i->ARMin.Add32.imm32 = imm32;
+ }
+ return i;
+}
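+
+/* Usage sketch: ARMInstr_Add32(rD, rN, 0x1000) folds into a single
+ "add rD, rN, #4096", since 0x1000 is a valid 8x4 immediate, whereas
+ imm32 = 0xFFFF has no such encoding and keeps the Add32 pseudo-insn
+ for the emitter to expand. */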
+
+/* ... */
+
+void ppARMInstr ( ARMInstr* i ) {
+ switch (i->tag) {
+ case ARMin_Alu:
+ vex_printf("%-4s ", showARMAluOp(i->ARMin.Alu.op));
+ ppHRegARM(i->ARMin.Alu.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.Alu.argL);
+ vex_printf(", ");
+ ppARMRI84(i->ARMin.Alu.argR);
+ return;
+ case ARMin_Shift:
+ vex_printf("%s ", showARMShiftOp(i->ARMin.Shift.op));
+ ppHRegARM(i->ARMin.Shift.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.Shift.argL);
+ vex_printf(", ");
+ ppARMRI5(i->ARMin.Shift.argR);
+ return;
+ case ARMin_Unary:
+ vex_printf("%s ", showARMUnaryOp(i->ARMin.Unary.op));
+ ppHRegARM(i->ARMin.Unary.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.Unary.src);
+ return;
+ case ARMin_CmpOrTst:
+ vex_printf("%s ", i->ARMin.CmpOrTst.isCmp ? "cmp" : "tst");
+ ppHRegARM(i->ARMin.CmpOrTst.argL);
+ vex_printf(", ");
+ ppARMRI84(i->ARMin.CmpOrTst.argR);
+ return;
+ case ARMin_Mov:
+ vex_printf("mov ");
+ ppHRegARM(i->ARMin.Mov.dst);
+ vex_printf(", ");
+ ppARMRI84(i->ARMin.Mov.src);
+ return;
+ case ARMin_Imm32:
+ vex_printf("imm ");
+ ppHRegARM(i->ARMin.Imm32.dst);
+ vex_printf(", 0x%x", i->ARMin.Imm32.imm32);
+ return;
+ case ARMin_LdSt32:
+ if (i->ARMin.LdSt32.isLoad) {
+ vex_printf("ldr ");
+ ppHRegARM(i->ARMin.LdSt32.rD);
+ vex_printf(", ");
+ ppARMAMode1(i->ARMin.LdSt32.amode);
+ } else {
+ vex_printf("str ");
+ ppARMAMode1(i->ARMin.LdSt32.amode);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.LdSt32.rD);
+ }
+ return;
+ case ARMin_LdSt16:
+ if (i->ARMin.LdSt16.isLoad) {
+ vex_printf("%s", i->ARMin.LdSt16.signedLoad
+ ? "ldrsh " : "ldrh " );
+ ppHRegARM(i->ARMin.LdSt16.rD);
+ vex_printf(", ");
+ ppARMAMode2(i->ARMin.LdSt16.amode);
+ } else {
+ vex_printf("strh ");
+ ppARMAMode2(i->ARMin.LdSt16.amode);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.LdSt16.rD);
+ }
+ return;
+ case ARMin_LdSt8U:
+ if (i->ARMin.LdSt8U.isLoad) {
+ vex_printf("ldrb ");
+ ppHRegARM(i->ARMin.LdSt8U.rD);
+ vex_printf(", ");
+ ppARMAMode1(i->ARMin.LdSt8U.amode);
+ } else {
+ vex_printf("strb ");
+ ppARMAMode1(i->ARMin.LdSt8U.amode);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.LdSt8U.rD);
+ }
+ return;
+ case ARMin_Ld8S:
+ goto unhandled;
+ case ARMin_Goto:
+ if (i->ARMin.Goto.cond != ARMcc_AL) {
+ vex_printf("if (%%cpsr.%s) { ",
+ showARMCondCode(i->ARMin.Goto.cond));
+ } else {
+ vex_printf("if (1) { ");
+ }
+ if (i->ARMin.Goto.jk != Ijk_Boring
+ && i->ARMin.Goto.jk != Ijk_Call
+ && i->ARMin.Goto.jk != Ijk_Ret) {
+ vex_printf("mov r8, $");
+ ppIRJumpKind(i->ARMin.Goto.jk);
+ vex_printf(" ; ");
+ }
+ vex_printf("mov r0, ");
+ ppHRegARM(i->ARMin.Goto.gnext);
+ vex_printf(" ; bx r14");
+ vex_printf(" }");
+ return;
+ case ARMin_CMov:
+ vex_printf("mov%s ", showARMCondCode(i->ARMin.CMov.cond));
+ ppHRegARM(i->ARMin.CMov.dst);
+ vex_printf(", ");
+ ppARMRI84(i->ARMin.CMov.src);
+ return;
+ case ARMin_Call:
+ vex_printf("call%s ",
+ i->ARMin.Call.cond==ARMcc_AL
+ ? "" : showARMCondCode(i->ARMin.Call.cond));
+ vex_printf("0x%lx [nArgRegs=%d]",
+ i->ARMin.Call.target, i->ARMin.Call.nArgRegs);
+ return;
+ case ARMin_Mul:
+ vex_printf("%-5s ", showARMMulOp(i->ARMin.Mul.op));
+ if (i->ARMin.Mul.op == ARMmul_PLAIN) {
+ vex_printf("r0, r2, r3");
+ } else {
+ vex_printf("r1:r0, r2, r3");
+ }
+ return;
+ case ARMin_LdrEX:
+ vex_printf("ldrex%s ", i->ARMin.LdrEX.szB == 1 ? "b"
+ : i->ARMin.LdrEX.szB == 2 ? "h" : "");
+ vex_printf("r0, [r1]");
+ return;
+ case ARMin_StrEX:
+ vex_printf("strex%s ", i->ARMin.StrEX.szB == 1 ? "b"
+ : i->ARMin.StrEX.szB == 2 ? "h" : "");
+ vex_printf("r0, r1, [r2]");
+ return;
+ case ARMin_VLdStD:
+ if (i->ARMin.VLdStD.isLoad) {
+ vex_printf("fldd ");
+ ppHRegARM(i->ARMin.VLdStD.dD);
+ vex_printf(", ");
+ ppARMAModeV(i->ARMin.VLdStD.amode);
+ } else {
+ vex_printf("fstd ");
+ ppARMAModeV(i->ARMin.VLdStD.amode);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VLdStD.dD);
+ }
+ return;
+ case ARMin_VLdStS:
+ if (i->ARMin.VLdStS.isLoad) {
+ vex_printf("flds ");
+ ppHRegARM(i->ARMin.VLdStS.fD);
+ vex_printf(", ");
+ ppARMAModeV(i->ARMin.VLdStS.amode);
+ } else {
+ vex_printf("fsts ");
+ ppARMAModeV(i->ARMin.VLdStS.amode);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VLdStS.fD);
+ }
+ return;
+ case ARMin_VAluD:
+ vex_printf("f%-3sd ", showARMVfpOp(i->ARMin.VAluD.op));
+ ppHRegARM(i->ARMin.VAluD.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VAluD.argL);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VAluD.argR);
+ return;
+ case ARMin_VAluS:
+ vex_printf("f%-3ss ", showARMVfpOp(i->ARMin.VAluS.op));
+ ppHRegARM(i->ARMin.VAluS.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VAluS.argL);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VAluS.argR);
+ return;
+ case ARMin_VUnaryD:
+ vex_printf("f%-3sd ", showARMVfpUnaryOp(i->ARMin.VUnaryD.op));
+ ppHRegARM(i->ARMin.VUnaryD.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VUnaryD.src);
+ return;
+ case ARMin_VUnaryS:
+ vex_printf("f%-3ss ", showARMVfpUnaryOp(i->ARMin.VUnaryS.op));
+ ppHRegARM(i->ARMin.VUnaryS.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VUnaryS.src);
+ return;
+ case ARMin_VCmpD:
+ vex_printf("fcmpd ");
+ ppHRegARM(i->ARMin.VCmpD.argL);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VCmpD.argR);
+ vex_printf(" ; fmstat");
+ return;
+ case ARMin_VCMovD:
+ vex_printf("fcpyd%s ", showARMCondCode(i->ARMin.VCMovD.cond));
+ ppHRegARM(i->ARMin.VCMovD.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VCMovD.src);
+ return;
+ case ARMin_VCMovS:
+ vex_printf("fcpys%s ", showARMCondCode(i->ARMin.VCMovS.cond));
+ ppHRegARM(i->ARMin.VCMovS.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VCMovS.src);
+ return;
+ case ARMin_VCvtSD:
+ vex_printf("fcvt%s ", i->ARMin.VCvtSD.sToD ? "ds" : "sd");
+ ppHRegARM(i->ARMin.VCvtSD.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VCvtSD.src);
+ return;
+ case ARMin_VXferD:
+ vex_printf("vmov ");
+ if (i->ARMin.VXferD.toD) {
+ ppHRegARM(i->ARMin.VXferD.dD);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VXferD.rLo);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VXferD.rHi);
+ } else {
+ ppHRegARM(i->ARMin.VXferD.rLo);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VXferD.rHi);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VXferD.dD);
+ }
+ return;
+ case ARMin_VXferS:
+ vex_printf("vmov ");
+ if (i->ARMin.VXferS.toS) {
+ ppHRegARM(i->ARMin.VXferS.fD);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VXferS.rLo);
+ } else {
+ ppHRegARM(i->ARMin.VXferS.rLo);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VXferS.fD);
+ }
+ return;
+ case ARMin_VCvtID: {
+ HChar* nm = "?";
+ if (i->ARMin.VCvtID.iToD) {
+ nm = i->ARMin.VCvtID.syned ? "fsitod" : "fuitod";
+ } else {
+ nm = i->ARMin.VCvtID.syned ? "ftosid" : "ftouid";
+ }
+ vex_printf("%s ", nm);
+ ppHRegARM(i->ARMin.VCvtID.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.VCvtID.src);
+ return;
+ }
+ case ARMin_FPSCR:
+ if (i->ARMin.FPSCR.toFPSCR) {
+ vex_printf("fmxr fpscr, ");
+ ppHRegARM(i->ARMin.FPSCR.iReg);
+ } else {
+ vex_printf("fmrx ");
+ ppHRegARM(i->ARMin.FPSCR.iReg);
+ vex_printf(", fpscr");
+ }
+ return;
+ case ARMin_MFence:
+ vex_printf("mfence (mcr 15,0,r0,c7,c10,4; 15,0,r0,c7,c10,5; "
+ "15,0,r0,c7,c5,4)");
+ return;
+ case ARMin_NLdStQ:
+ if (i->ARMin.NLdStQ.isLoad)
+ vex_printf("vld1.32 {");
+ else
+ vex_printf("vst1.32 {");
+ ppHRegARM(i->ARMin.NLdStQ.dQ);
+ vex_printf("} ");
+ ppARMAModeN(i->ARMin.NLdStQ.amode);
+ return;
+ case ARMin_NLdStD:
+ if (i->ARMin.NLdStD.isLoad)
+ vex_printf("vld1.32 {");
+ else
+ vex_printf("vst1.32 {");
+ ppHRegARM(i->ARMin.NLdStD.dD);
+ vex_printf("} ");
+ ppARMAModeN(i->ARMin.NLdStD.amode);
+ return;
+ case ARMin_NUnary:
+ vex_printf("%s%s%s ",
+ showARMNeonUnOp(i->ARMin.NUnary.op),
+ showARMNeonUnOpDataType(i->ARMin.NUnary.op),
+ showARMNeonDataSize(i));
+ ppHRegARM(i->ARMin.NUnary.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.NUnary.src);
+ if (i->ARMin.NUnary.op == ARMneon_EQZ)
+ vex_printf(", #0");
+ if (i->ARMin.NUnary.op == ARMneon_VCVTFtoFixedS ||
+ i->ARMin.NUnary.op == ARMneon_VCVTFtoFixedU ||
+ i->ARMin.NUnary.op == ARMneon_VCVTFixedStoF ||
+ i->ARMin.NUnary.op == ARMneon_VCVTFixedUtoF) {
+ vex_printf(", #%d", i->ARMin.NUnary.size);
+ }
+ if (i->ARMin.NUnary.op == ARMneon_VQSHLNSS ||
+ i->ARMin.NUnary.op == ARMneon_VQSHLNUU ||
+ i->ARMin.NUnary.op == ARMneon_VQSHLNUS) {
+ UInt size;
+ size = i->ARMin.NUnary.size;
+ if (size & 0x40) {
+ vex_printf(", #%d", size - 64);
+ } else if (size & 0x20) {
+ vex_printf(", #%d", size - 32);
+ } else if (size & 0x10) {
+ vex_printf(", #%d", size - 16);
+ } else if (size & 0x08) {
+ vex_printf(", #%d", size - 8);
+ }
+ }
+ return;
+ case ARMin_NUnaryS:
+ vex_printf("%s%s%s ",
+ showARMNeonUnOpS(i->ARMin.NUnaryS.op),
+ showARMNeonUnOpSDataType(i->ARMin.NUnaryS.op),
+ showARMNeonDataSize(i));
+ ppARMNRS(i->ARMin.NUnaryS.dst);
+ vex_printf(", ");
+ ppARMNRS(i->ARMin.NUnaryS.src);
+ return;
+ case ARMin_NShift:
+ vex_printf("%s%s%s ",
+ showARMNeonShiftOp(i->ARMin.NShift.op),
+ showARMNeonShiftOpDataType(i->ARMin.NShift.op),
+ showARMNeonDataSize(i));
+ ppHRegARM(i->ARMin.NShift.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.NShift.argL);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.NShift.argR);
+ return;
+ case ARMin_NDual:
+ vex_printf("%s%s%s ",
+ showARMNeonDualOp(i->ARMin.NDual.op),
+ showARMNeonDualOpDataType(i->ARMin.NDual.op),
+ showARMNeonDataSize(i));
+ ppHRegARM(i->ARMin.NDual.arg1);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.NDual.arg2);
+ return;
+ case ARMin_NBinary:
+ vex_printf("%s%s%s",
+ showARMNeonBinOp(i->ARMin.NBinary.op),
+ showARMNeonBinOpDataType(i->ARMin.NBinary.op),
+ showARMNeonDataSize(i));
+ vex_printf(" ");
+ ppHRegARM(i->ARMin.NBinary.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.NBinary.argL);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.NBinary.argR);
+ return;
+ case ARMin_NeonImm:
+ vex_printf("vmov ");
+ ppHRegARM(i->ARMin.NeonImm.dst);
+ vex_printf(", ");
+ ppARMNImm(i->ARMin.NeonImm.imm);
+ return;
+ case ARMin_NCMovQ:
+ vex_printf("vmov%s ", showARMCondCode(i->ARMin.NCMovQ.cond));
+ ppHRegARM(i->ARMin.NCMovQ.dst);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.NCMovQ.src);
+ return;
+ case ARMin_Add32:
+ vex_printf("add32 ");
+ ppHRegARM(i->ARMin.Add32.rD);
+ vex_printf(", ");
+ ppHRegARM(i->ARMin.Add32.rN);
+ vex_printf(", ");
+ vex_printf("%d", i->ARMin.Add32.imm32);
+ return;
+ default:
+ unhandled:
+ vex_printf("ppARMInstr: unhandled case (tag %d)", (Int)i->tag);
+ vpanic("ppARMInstr(1)");
+ return;
+ }
+}
+
+
+/* --------- Helpers for register allocation. --------- */
+
+void getRegUsage_ARMInstr ( HRegUsage* u, ARMInstr* i, Bool mode64 )
+{
+ vassert(mode64 == False);
+ initHRegUsage(u);
+ switch (i->tag) {
+ case ARMin_Alu:
+ addHRegUse(u, HRmWrite, i->ARMin.Alu.dst);
+ addHRegUse(u, HRmRead, i->ARMin.Alu.argL);
+ addRegUsage_ARMRI84(u, i->ARMin.Alu.argR);
+ return;
+ case ARMin_Shift:
+ addHRegUse(u, HRmWrite, i->ARMin.Shift.dst);
+ addHRegUse(u, HRmRead, i->ARMin.Shift.argL);
+ addRegUsage_ARMRI5(u, i->ARMin.Shift.argR);
+ return;
+ case ARMin_Unary:
+ addHRegUse(u, HRmWrite, i->ARMin.Unary.dst);
+ addHRegUse(u, HRmRead, i->ARMin.Unary.src);
+ return;
+ case ARMin_CmpOrTst:
+ addHRegUse(u, HRmRead, i->ARMin.CmpOrTst.argL);
+ addRegUsage_ARMRI84(u, i->ARMin.CmpOrTst.argR);
+ return;
+ case ARMin_Mov:
+ addHRegUse(u, HRmWrite, i->ARMin.Mov.dst);
+ addRegUsage_ARMRI84(u, i->ARMin.Mov.src);
+ return;
+ case ARMin_Imm32:
+ addHRegUse(u, HRmWrite, i->ARMin.Imm32.dst);
+ return;
+ case ARMin_LdSt32:
+ addRegUsage_ARMAMode1(u, i->ARMin.LdSt32.amode);
+ if (i->ARMin.LdSt32.isLoad) {
+ addHRegUse(u, HRmWrite, i->ARMin.LdSt32.rD);
+ } else {
+ addHRegUse(u, HRmRead, i->ARMin.LdSt32.rD);
+ }
+ return;
+ case ARMin_LdSt16:
+ addRegUsage_ARMAMode2(u, i->ARMin.LdSt16.amode);
+ if (i->ARMin.LdSt16.isLoad) {
+ addHRegUse(u, HRmWrite, i->ARMin.LdSt16.rD);
+ } else {
+ addHRegUse(u, HRmRead, i->ARMin.LdSt16.rD);
+ }
+ return;
+ case ARMin_LdSt8U:
+ addRegUsage_ARMAMode1(u, i->ARMin.LdSt8U.amode);
+ if (i->ARMin.LdSt8U.isLoad) {
+ addHRegUse(u, HRmWrite, i->ARMin.LdSt8U.rD);
+ } else {
+ addHRegUse(u, HRmRead, i->ARMin.LdSt8U.rD);
+ }
+ return;
+ case ARMin_Ld8S:
+ goto unhandled;
+ case ARMin_Goto:
+ /* reads the reg holding the next guest addr */
+ addHRegUse(u, HRmRead, i->ARMin.Goto.gnext);
+ /* writes it to the standard integer return register */
+ addHRegUse(u, HRmWrite, hregARM_R0());
+ /* possibly messes with the baseblock pointer */
+ if (i->ARMin.Goto.jk != Ijk_Boring
+ && i->ARMin.Goto.jk != Ijk_Call
+ && i->ARMin.Goto.jk != Ijk_Ret)
+ /* note, this is irrelevant since r8 is not actually
+ available to the allocator. But still .. */
+ addHRegUse(u, HRmWrite, hregARM_R8());
+ return;
+ case ARMin_CMov:
+ addHRegUse(u, HRmWrite, i->ARMin.CMov.dst);
+ addHRegUse(u, HRmRead, i->ARMin.CMov.dst);
+ addRegUsage_ARMRI84(u, i->ARMin.CMov.src);
+ return;
+ case ARMin_Call:
+ /* logic and comments copied/modified from x86 back end */
+ /* This is a bit subtle. */
+ /* First off, claim it trashes all the caller-saved regs
+ which fall within the register allocator's jurisdiction.
+ These I believe to be r0,1,2,3. If it turns out that r9
+ is also caller-saved, then we'll have to add that here
+ too. */
+ addHRegUse(u, HRmWrite, hregARM_R0());
+ addHRegUse(u, HRmWrite, hregARM_R1());
+ addHRegUse(u, HRmWrite, hregARM_R2());
+ addHRegUse(u, HRmWrite, hregARM_R3());
+ /* Now we have to state any parameter-carrying registers
+ which might be read. This depends on nArgRegs. */
+ switch (i->ARMin.Call.nArgRegs) {
+ case 4: addHRegUse(u, HRmRead, hregARM_R3()); /*fallthru*/
+ case 3: addHRegUse(u, HRmRead, hregARM_R2()); /*fallthru*/
+ case 2: addHRegUse(u, HRmRead, hregARM_R1()); /*fallthru*/
+ case 1: addHRegUse(u, HRmRead, hregARM_R0()); break;
+ case 0: break;
+ default: vpanic("getRegUsage_ARM:Call:regparms");
+ }
+ /* Finally, there is the issue that the insn trashes a
+ register because the literal target address has to be
+ loaded into a register. Fortunately, for the nArgRegs=
+ 0/1/2/3 case, we can use r0, r1, r2 or r3 respectively, so
+ this does not cause any further damage. For the
+ nArgRegs=4 case, we'll have to choose another register
+ arbitrarily since all the caller saved regs are used for
+ parameters, and so we might as well choose r11.
+ */
+ if (i->ARMin.Call.nArgRegs == 4)
+ addHRegUse(u, HRmWrite, hregARM_R11());
+ /* Upshot of this is that the assembler really must observe
+ the here-stated convention of which register to use as an
+ address temporary, depending on nArgRegs: 0==r0,
+ 1==r1, 2==r2, 3==r3, 4==r11 */
+ return;
+ case ARMin_Mul:
+ addHRegUse(u, HRmRead, hregARM_R2());
+ addHRegUse(u, HRmRead, hregARM_R3());
+ addHRegUse(u, HRmWrite, hregARM_R0());
+ if (i->ARMin.Mul.op != ARMmul_PLAIN)
+ addHRegUse(u, HRmWrite, hregARM_R1());
+ return;
+ case ARMin_LdrEX:
+ addHRegUse(u, HRmWrite, hregARM_R0());
+ addHRegUse(u, HRmRead, hregARM_R1());
+ return;
+ case ARMin_StrEX:
+ addHRegUse(u, HRmWrite, hregARM_R0());
+ addHRegUse(u, HRmRead, hregARM_R1());
+ addHRegUse(u, HRmRead, hregARM_R2());
+ return;
+ case ARMin_VLdStD:
+ addRegUsage_ARMAModeV(u, i->ARMin.VLdStD.amode);
+ if (i->ARMin.VLdStD.isLoad) {
+ addHRegUse(u, HRmWrite, i->ARMin.VLdStD.dD);
+ } else {
+ addHRegUse(u, HRmRead, i->ARMin.VLdStD.dD);
+ }
+ return;
+ case ARMin_VLdStS:
+ addRegUsage_ARMAModeV(u, i->ARMin.VLdStS.amode);
+ if (i->ARMin.VLdStS.isLoad) {
+ addHRegUse(u, HRmWrite, i->ARMin.VLdStS.fD);
+ } else {
+ addHRegUse(u, HRmRead, i->ARMin.VLdStS.fD);
+ }
+ return;
+ case ARMin_VAluD:
+ addHRegUse(u, HRmWrite, i->ARMin.VAluD.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VAluD.argL);
+ addHRegUse(u, HRmRead, i->ARMin.VAluD.argR);
+ return;
+ case ARMin_VAluS:
+ addHRegUse(u, HRmWrite, i->ARMin.VAluS.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VAluS.argL);
+ addHRegUse(u, HRmRead, i->ARMin.VAluS.argR);
+ return;
+ case ARMin_VUnaryD:
+ addHRegUse(u, HRmWrite, i->ARMin.VUnaryD.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VUnaryD.src);
+ return;
+ case ARMin_VUnaryS:
+ addHRegUse(u, HRmWrite, i->ARMin.VUnaryS.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VUnaryS.src);
+ return;
+ case ARMin_VCmpD:
+ addHRegUse(u, HRmRead, i->ARMin.VCmpD.argL);
+ addHRegUse(u, HRmRead, i->ARMin.VCmpD.argR);
+ return;
+ case ARMin_VCMovD:
+ addHRegUse(u, HRmWrite, i->ARMin.VCMovD.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VCMovD.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VCMovD.src);
+ return;
+ case ARMin_VCMovS:
+ addHRegUse(u, HRmWrite, i->ARMin.VCMovS.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VCMovS.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VCMovS.src);
+ return;
+ case ARMin_VCvtSD:
+ addHRegUse(u, HRmWrite, i->ARMin.VCvtSD.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VCvtSD.src);
+ return;
+ case ARMin_VXferD:
+ if (i->ARMin.VXferD.toD) {
+ addHRegUse(u, HRmWrite, i->ARMin.VXferD.dD);
+ addHRegUse(u, HRmRead, i->ARMin.VXferD.rHi);
+ addHRegUse(u, HRmRead, i->ARMin.VXferD.rLo);
+ } else {
+ addHRegUse(u, HRmRead, i->ARMin.VXferD.dD);
+ addHRegUse(u, HRmWrite, i->ARMin.VXferD.rHi);
+ addHRegUse(u, HRmWrite, i->ARMin.VXferD.rLo);
+ }
+ return;
+ case ARMin_VXferS:
+ if (i->ARMin.VXferS.toS) {
+ addHRegUse(u, HRmWrite, i->ARMin.VXferS.fD);
+ addHRegUse(u, HRmRead, i->ARMin.VXferS.rLo);
+ } else {
+ addHRegUse(u, HRmRead, i->ARMin.VXferS.fD);
+ addHRegUse(u, HRmWrite, i->ARMin.VXferS.rLo);
+ }
+ return;
+ case ARMin_VCvtID:
+ addHRegUse(u, HRmWrite, i->ARMin.VCvtID.dst);
+ addHRegUse(u, HRmRead, i->ARMin.VCvtID.src);
+ return;
+ case ARMin_FPSCR:
+ if (i->ARMin.FPSCR.toFPSCR)
+ addHRegUse(u, HRmRead, i->ARMin.FPSCR.iReg);
+ else
+ addHRegUse(u, HRmWrite, i->ARMin.FPSCR.iReg);
+ return;
+ case ARMin_MFence:
+ return;
+ case ARMin_NLdStQ:
+ if (i->ARMin.NLdStQ.isLoad)
+ addHRegUse(u, HRmWrite, i->ARMin.NLdStQ.dQ);
+ else
+ addHRegUse(u, HRmRead, i->ARMin.NLdStQ.dQ);
+ addRegUsage_ARMAModeN(u, i->ARMin.NLdStQ.amode);
+ return;
+ case ARMin_NLdStD:
+ if (i->ARMin.NLdStD.isLoad)
+ addHRegUse(u, HRmWrite, i->ARMin.NLdStD.dD);
+ else
+ addHRegUse(u, HRmRead, i->ARMin.NLdStD.dD);
+ addRegUsage_ARMAModeN(u, i->ARMin.NLdStD.amode);
+ return;
+ case ARMin_NUnary:
+ addHRegUse(u, HRmWrite, i->ARMin.NUnary.dst);
+ addHRegUse(u, HRmRead, i->ARMin.NUnary.src);
+ return;
+ case ARMin_NUnaryS:
+ addHRegUse(u, HRmWrite, i->ARMin.NUnaryS.dst->reg);
+ addHRegUse(u, HRmRead, i->ARMin.NUnaryS.src->reg);
+ return;
+ case ARMin_NShift:
+ addHRegUse(u, HRmWrite, i->ARMin.NShift.dst);
+ addHRegUse(u, HRmRead, i->ARMin.NShift.argL);
+ addHRegUse(u, HRmRead, i->ARMin.NShift.argR);
+ return;
+ case ARMin_NDual:
+ addHRegUse(u, HRmWrite, i->ARMin.NDual.arg1);
+ addHRegUse(u, HRmWrite, i->ARMin.NDual.arg2);
+ addHRegUse(u, HRmRead, i->ARMin.NDual.arg1);
+ addHRegUse(u, HRmRead, i->ARMin.NDual.arg2);
+ return;
+ case ARMin_NBinary:
+ addHRegUse(u, HRmWrite, i->ARMin.NBinary.dst);
+ /* TODO: sometimes dst is also being read! */
+ // XXX fix this
+ addHRegUse(u, HRmRead, i->ARMin.NBinary.argL);
+ addHRegUse(u, HRmRead, i->ARMin.NBinary.argR);
+ return;
+ case ARMin_NeonImm:
+ addHRegUse(u, HRmWrite, i->ARMin.NeonImm.dst);
+ return;
+ case ARMin_NCMovQ:
+ addHRegUse(u, HRmWrite, i->ARMin.NCMovQ.dst);
+ addHRegUse(u, HRmRead, i->ARMin.NCMovQ.dst);
+ addHRegUse(u, HRmRead, i->ARMin.NCMovQ.src);
+ return;
+ case ARMin_Add32:
+ addHRegUse(u, HRmWrite, i->ARMin.Add32.rD);
+ addHRegUse(u, HRmRead, i->ARMin.Add32.rN);
+ return;
+ unhandled:
+ default:
+ ppARMInstr(i);
+ vpanic("getRegUsage_ARMInstr");
+ }
+}
+
+
+void mapRegs_ARMInstr ( HRegRemap* m, ARMInstr* i, Bool mode64 )
+{
+ vassert(mode64 == False);
+ switch (i->tag) {
+ case ARMin_Alu:
+ i->ARMin.Alu.dst = lookupHRegRemap(m, i->ARMin.Alu.dst);
+ i->ARMin.Alu.argL = lookupHRegRemap(m, i->ARMin.Alu.argL);
+ mapRegs_ARMRI84(m, i->ARMin.Alu.argR);
+ return;
+ case ARMin_Shift:
+ i->ARMin.Shift.dst = lookupHRegRemap(m, i->ARMin.Shift.dst);
+ i->ARMin.Shift.argL = lookupHRegRemap(m, i->ARMin.Shift.argL);
+ mapRegs_ARMRI5(m, i->ARMin.Shift.argR);
+ return;
+ case ARMin_Unary:
+ i->ARMin.Unary.dst = lookupHRegRemap(m, i->ARMin.Unary.dst);
+ i->ARMin.Unary.src = lookupHRegRemap(m, i->ARMin.Unary.src);
+ return;
+ case ARMin_CmpOrTst:
+ i->ARMin.CmpOrTst.argL = lookupHRegRemap(m, i->ARMin.CmpOrTst.argL);
+ mapRegs_ARMRI84(m, i->ARMin.CmpOrTst.argR);
+ return;
+ case ARMin_Mov:
+ i->ARMin.Mov.dst = lookupHRegRemap(m, i->ARMin.Mov.dst);
+ mapRegs_ARMRI84(m, i->ARMin.Mov.src);
+ return;
+ case ARMin_Imm32:
+ i->ARMin.Imm32.dst = lookupHRegRemap(m, i->ARMin.Imm32.dst);
+ return;
+ case ARMin_LdSt32:
+ i->ARMin.LdSt32.rD = lookupHRegRemap(m, i->ARMin.LdSt32.rD);
+ mapRegs_ARMAMode1(m, i->ARMin.LdSt32.amode);
+ return;
+ case ARMin_LdSt16:
+ i->ARMin.LdSt16.rD = lookupHRegRemap(m, i->ARMin.LdSt16.rD);
+ mapRegs_ARMAMode2(m, i->ARMin.LdSt16.amode);
+ return;
+ case ARMin_LdSt8U:
+ i->ARMin.LdSt8U.rD = lookupHRegRemap(m, i->ARMin.LdSt8U.rD);
+ mapRegs_ARMAMode1(m, i->ARMin.LdSt8U.amode);
+ return;
+ case ARMin_Ld8S:
+ goto unhandled;
+ case ARMin_Goto:
+ i->ARMin.Goto.gnext = lookupHRegRemap(m, i->ARMin.Goto.gnext);
+ return;
+ case ARMin_CMov:
+ i->ARMin.CMov.dst = lookupHRegRemap(m, i->ARMin.CMov.dst);
+ mapRegs_ARMRI84(m, i->ARMin.CMov.src);
+ return;
+ case ARMin_Call:
+ return;
+ case ARMin_Mul:
+ return;
+ case ARMin_LdrEX:
+ return;
+ case ARMin_StrEX:
+ return;
+ case ARMin_VLdStD:
+ i->ARMin.VLdStD.dD = lookupHRegRemap(m, i->ARMin.VLdStD.dD);
+ mapRegs_ARMAModeV(m, i->ARMin.VLdStD.amode);
+ return;
+ case ARMin_VLdStS:
+ i->ARMin.VLdStS.fD = lookupHRegRemap(m, i->ARMin.VLdStS.fD);
+ mapRegs_ARMAModeV(m, i->ARMin.VLdStS.amode);
+ return;
+ case ARMin_VAluD:
+ i->ARMin.VAluD.dst = lookupHRegRemap(m, i->ARMin.VAluD.dst);
+ i->ARMin.VAluD.argL = lookupHRegRemap(m, i->ARMin.VAluD.argL);
+ i->ARMin.VAluD.argR = lookupHRegRemap(m, i->ARMin.VAluD.argR);
+ return;
+ case ARMin_VAluS:
+ i->ARMin.VAluS.dst = lookupHRegRemap(m, i->ARMin.VAluS.dst);
+ i->ARMin.VAluS.argL = lookupHRegRemap(m, i->ARMin.VAluS.argL);
+ i->ARMin.VAluS.argR = lookupHRegRemap(m, i->ARMin.VAluS.argR);
+ return;
+ case ARMin_VUnaryD:
+ i->ARMin.VUnaryD.dst = lookupHRegRemap(m, i->ARMin.VUnaryD.dst);
+ i->ARMin.VUnaryD.src = lookupHRegRemap(m, i->ARMin.VUnaryD.src);
+ return;
+ case ARMin_VUnaryS:
+ i->ARMin.VUnaryS.dst = lookupHRegRemap(m, i->ARMin.VUnaryS.dst);
+ i->ARMin.VUnaryS.src = lookupHRegRemap(m, i->ARMin.VUnaryS.src);
+ return;
+ case ARMin_VCmpD:
+ i->ARMin.VCmpD.argL = lookupHRegRemap(m, i->ARMin.VCmpD.argL);
+ i->ARMin.VCmpD.argR = lookupHRegRemap(m, i->ARMin.VCmpD.argR);
+ return;
+ case ARMin_VCMovD:
+ i->ARMin.VCMovD.dst = lookupHRegRemap(m, i->ARMin.VCMovD.dst);
+ i->ARMin.VCMovD.src = lookupHRegRemap(m, i->ARMin.VCMovD.src);
+ return;
+ case ARMin_VCMovS:
+ i->ARMin.VCMovS.dst = lookupHRegRemap(m, i->ARMin.VCMovS.dst);
+ i->ARMin.VCMovS.src = lookupHRegRemap(m, i->ARMin.VCMovS.src);
+ return;
+ case ARMin_VCvtSD:
+ i->ARMin.VCvtSD.dst = lookupHRegRemap(m, i->ARMin.VCvtSD.dst);
+ i->ARMin.VCvtSD.src = lookupHRegRemap(m, i->ARMin.VCvtSD.src);
+ return;
+ case ARMin_VXferD:
+ i->ARMin.VXferD.dD = lookupHRegRemap(m, i->ARMin.VXferD.dD);
+ i->ARMin.VXferD.rHi = lookupHRegRemap(m, i->ARMin.VXferD.rHi);
+ i->ARMin.VXferD.rLo = lookupHRegRemap(m, i->ARMin.VXferD.rLo);
+ return;
+ case ARMin_VXferS:
+ i->ARMin.VXferS.fD = lookupHRegRemap(m, i->ARMin.VXferS.fD);
+ i->ARMin.VXferS.rLo = lookupHRegRemap(m, i->ARMin.VXferS.rLo);
+ return;
+ case ARMin_VCvtID:
+ i->ARMin.VCvtID.dst = lookupHRegRemap(m, i->ARMin.VCvtID.dst);
+ i->ARMin.VCvtID.src = lookupHRegRemap(m, i->ARMin.VCvtID.src);
+ return;
+ case ARMin_FPSCR:
+ i->ARMin.FPSCR.iReg = lookupHRegRemap(m, i->ARMin.FPSCR.iReg);
+ return;
+ case ARMin_MFence:
+ return;
+ case ARMin_NLdStQ:
+ i->ARMin.NLdStQ.dQ = lookupHRegRemap(m, i->ARMin.NLdStQ.dQ);
+ mapRegs_ARMAModeN(m, i->ARMin.NLdStQ.amode);
+ return;
+ case ARMin_NLdStD:
+ i->ARMin.NLdStD.dD = lookupHRegRemap(m, i->ARMin.NLdStD.dD);
+ mapRegs_ARMAModeN(m, i->ARMin.NLdStD.amode);
+ return;
+ case ARMin_NUnary:
+ i->ARMin.NUnary.src = lookupHRegRemap(m, i->ARMin.NUnary.src);
+ i->ARMin.NUnary.dst = lookupHRegRemap(m, i->ARMin.NUnary.dst);
+ return;
+ case ARMin_NUnaryS:
+ i->ARMin.NUnaryS.src->reg
+ = lookupHRegRemap(m, i->ARMin.NUnaryS.src->reg);
+ i->ARMin.NUnaryS.dst->reg
+ = lookupHRegRemap(m, i->ARMin.NUnaryS.dst->reg);
+ return;
+ case ARMin_NShift:
+ i->ARMin.NShift.dst = lookupHRegRemap(m, i->ARMin.NShift.dst);
+ i->ARMin.NShift.argL = lookupHRegRemap(m, i->ARMin.NShift.argL);
+ i->ARMin.NShift.argR = lookupHRegRemap(m, i->ARMin.NShift.argR);
+ return;
+ case ARMin_NDual:
+ i->ARMin.NDual.arg1 = lookupHRegRemap(m, i->ARMin.NDual.arg1);
+ i->ARMin.NDual.arg2 = lookupHRegRemap(m, i->ARMin.NDual.arg2);
+ return;
+ case ARMin_NBinary:
+ i->ARMin.NBinary.argL = lookupHRegRemap(m, i->ARMin.NBinary.argL);
+ i->ARMin.NBinary.argR = lookupHRegRemap(m, i->ARMin.NBinary.argR);
+ i->ARMin.NBinary.dst = lookupHRegRemap(m, i->ARMin.NBinary.dst);
+ return;
+ case ARMin_NeonImm:
+ i->ARMin.NeonImm.dst = lookupHRegRemap(m, i->ARMin.NeonImm.dst);
+ return;
+ case ARMin_NCMovQ:
+ i->ARMin.NCMovQ.dst = lookupHRegRemap(m, i->ARMin.NCMovQ.dst);
+ i->ARMin.NCMovQ.src = lookupHRegRemap(m, i->ARMin.NCMovQ.src);
+ return;
+ case ARMin_Add32:
+ i->ARMin.Add32.rD = lookupHRegRemap(m, i->ARMin.Add32.rD);
+ i->ARMin.Add32.rN = lookupHRegRemap(m, i->ARMin.Add32.rN);
+ return;
+ unhandled:
+ default:
+ ppARMInstr(i);
+ vpanic("mapRegs_ARMInstr");
+ }
+}
+
+/* Figure out if i represents a reg-reg move, and if so assign the
+ source and destination to *src and *dst. If in doubt say No. Used
+ by the register allocator to do move coalescing.
+*/
+Bool isMove_ARMInstr ( ARMInstr* i, HReg* src, HReg* dst )
+{
+ /* Moves between integer regs */
+ switch (i->tag) {
+ case ARMin_Mov:
+ if (i->ARMin.Mov.src->tag == ARMri84_R) {
+ *src = i->ARMin.Mov.src->ARMri84.R.reg;
+ *dst = i->ARMin.Mov.dst;
+ return True;
+ }
+ break;
+ case ARMin_VUnaryD:
+ if (i->ARMin.VUnaryD.op == ARMvfpu_COPY) {
+ *src = i->ARMin.VUnaryD.src;
+ *dst = i->ARMin.VUnaryD.dst;
+ return True;
+ }
+ break;
+ case ARMin_VUnaryS:
+ if (i->ARMin.VUnaryS.op == ARMvfpu_COPY) {
+ *src = i->ARMin.VUnaryS.src;
+ *dst = i->ARMin.VUnaryS.dst;
+ return True;
+ }
+ break;
+ default:
+ break;
+ }
+
+ // todo: float, vector moves
+ return False;
+}
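+
+/* E.g. a "mov dst, src" whose operand is a plain register reports
+ (src, dst) to the allocator, which may then assign both virtual
+ registers to the same real register and delete the move. */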
+
+
+/* Generate arm spill/reload instructions under the direction of the
+ register allocator. Note it's critical these don't write the
+ condition codes. */
+
+void genSpill_ARM ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 )
+{
+ HRegClass rclass;
+ vassert(offsetB >= 0);
+ vassert(!hregIsVirtual(rreg));
+ vassert(mode64 == False);
+ *i1 = *i2 = NULL;
+ rclass = hregClass(rreg);
+ switch (rclass) {
+ case HRcInt32:
+ vassert(offsetB <= 4095);
+ *i1 = ARMInstr_LdSt32( False/*!isLoad*/,
+ rreg,
+ ARMAMode1_RI(hregARM_R8(), offsetB) );
+ return;
+ case HRcFlt32:
+ case HRcFlt64: {
+ HReg r8 = hregARM_R8(); /* baseblock */
+ HReg r12 = hregARM_R12(); /* spill temp */
+ HReg base = r8;
+ vassert(0 == (offsetB & 3));
+ if (offsetB >= 1024) {
+ Int offsetKB = offsetB / 1024;
+ /* r12 = r8 + (1024 * offsetKB) */
+ *i1 = ARMInstr_Alu(ARMalu_ADD, r12, r8,
+ ARMRI84_I84(offsetKB, 11));
+ offsetB -= (1024 * offsetKB);
+ base = r12;
+ }
+ vassert(offsetB <= 1020);
+ if (rclass == HRcFlt32) {
+ *i2 = ARMInstr_VLdStS( False/*!isLoad*/,
+ rreg,
+ mkARMAModeV(base, offsetB) );
+ } else {
+ *i2 = ARMInstr_VLdStD( False/*!isLoad*/,
+ rreg,
+ mkARMAModeV(base, offsetB) );
+ }
+ return;
+ }
+ case HRcVec128: {
+ HReg r8 = hregARM_R8();
+ HReg r12 = hregARM_R12();
+ *i1 = ARMInstr_Add32(r12, r8, offsetB);
+ *i2 = ARMInstr_NLdStQ(False, rreg, mkARMAModeN_R(r12));
+ return;
+ }
+ default:
+ ppHRegClass(rclass);
+ vpanic("genSpill_ARM: unimplemented regclass");
+ }
+}
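+
+/* Spill sketch (illustrative): a double spilled at offsetB = 2052
+ yields i1 = "add r12, r8, #2048" and i2 = "fstd dD, [r12, #4]",
+ keeping the VFP offset within its 1020-byte limit. genReload_ARM
+ below mirrors this with loads. */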
+
+void genReload_ARM ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 )
+{
+ HRegClass rclass;
+ vassert(offsetB >= 0);
+ vassert(!hregIsVirtual(rreg));
+ vassert(mode64 == False);
+ *i1 = *i2 = NULL;
+ rclass = hregClass(rreg);
+ switch (rclass) {
+ case HRcInt32:
+ vassert(offsetB <= 4095);
+ *i1 = ARMInstr_LdSt32( True/*isLoad*/,
+ rreg,
+ ARMAMode1_RI(hregARM_R8(), offsetB) );
+ return;
+ case HRcFlt32:
+ case HRcFlt64: {
+ HReg r8 = hregARM_R8(); /* baseblock */
+ HReg r12 = hregARM_R12(); /* spill temp */
+ HReg base = r8;
+ vassert(0 == (offsetB & 3));
+ if (offsetB >= 1024) {
+ Int offsetKB = offsetB / 1024;
+ /* r12 = r8 + (1024 * offsetKB) */
+ *i1 = ARMInstr_Alu(ARMalu_ADD, r12, r8,
+ ARMRI84_I84(offsetKB, 11));
+ offsetB -= (1024 * offsetKB);
+ base = r12;
+ }
+ vassert(offsetB <= 1020);
+ if (rclass == HRcFlt32) {
+ *i2 = ARMInstr_VLdStS( True/*isLoad*/,
+ rreg,
+ mkARMAModeV(base, offsetB) );
+ } else {
+ *i2 = ARMInstr_VLdStD( True/*isLoad*/,
+ rreg,
+ mkARMAModeV(base, offsetB) );
+ }
+ return;
+ }
+ case HRcVec128: {
+ HReg r8 = hregARM_R8();
+ HReg r12 = hregARM_R12();
+ *i1 = ARMInstr_Add32(r12, r8, offsetB);
+ *i2 = ARMInstr_NLdStQ(True, rreg, mkARMAModeN_R(r12));
+ return;
+ }
+ default:
+ ppHRegClass(rclass);
+ vpanic("genReload_ARM: unimplemented regclass");
+ }
+}
+
+
+/* Emit an instruction into buf and return the number of bytes used.
+ Note that buf is not the insn's final place, and therefore it is
+ imperative to emit position-independent code. */
+
+static inline UChar iregNo ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcInt32);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 15);
+ return toUChar(n);
+}
+
+static inline UChar dregNo ( HReg r )
+{
+ UInt n;
+ if (hregClass(r) != HRcFlt64)
+ ppHRegClass(hregClass(r));
+ vassert(hregClass(r) == HRcFlt64);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 31);
+ return toUChar(n);
+}
+
+static inline UChar fregNo ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcFlt32);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 31);
+ return toUChar(n);
+}
+
+static inline UChar qregNo ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcVec128);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 15);
+ return toUChar(n);
+}
+
+#define BITS4(zzb3,zzb2,zzb1,zzb0) \
+ (((zzb3) << 3) | ((zzb2) << 2) | ((zzb1) << 1) | (zzb0))
+#define X0000 BITS4(0,0,0,0)
+#define X0001 BITS4(0,0,0,1)
+#define X0010 BITS4(0,0,1,0)
+#define X0011 BITS4(0,0,1,1)
+#define X0100 BITS4(0,1,0,0)
+#define X0101 BITS4(0,1,0,1)
+#define X0110 BITS4(0,1,1,0)
+#define X0111 BITS4(0,1,1,1)
+#define X1000 BITS4(1,0,0,0)
+#define X1001 BITS4(1,0,0,1)
+#define X1010 BITS4(1,0,1,0)
+#define X1011 BITS4(1,0,1,1)
+#define X1100 BITS4(1,1,0,0)
+#define X1101 BITS4(1,1,0,1)
+#define X1110 BITS4(1,1,1,0)
+#define X1111 BITS4(1,1,1,1)
+
+#define XXXXX___(zzx7,zzx6,zzx5,zzx4,zzx3) \
+ ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \
+ (((zzx5) & 0xF) << 20) | (((zzx4) & 0xF) << 16) | \
+ (((zzx3) & 0xF) << 12))
+
+#define XXXXXX__(zzx7,zzx6,zzx5,zzx4,zzx3,zzx2) \
+ ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \
+ (((zzx5) & 0xF) << 20) | (((zzx4) & 0xF) << 16) | \
+ (((zzx3) & 0xF) << 12) | (((zzx2) & 0xF) << 8))
+
+#define XXXXX__X(zzx7,zzx6,zzx5,zzx4,zzx3,zzx0) \
+ ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \
+ (((zzx5) & 0xF) << 20) | (((zzx4) & 0xF) << 16) | \
+ (((zzx3) & 0xF) << 12) | (((zzx0) & 0xF) << 0))
+
+#define XXX___XX(zzx7,zzx6,zzx5,zzx1,zzx0) \
+ ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \
+ (((zzx5) & 0xF) << 20) | (((zzx1) & 0xF) << 4) | \
+ (((zzx0) & 0xF) << 0))
+
+#define XXXXXXXX(zzx7,zzx6,zzx5,zzx4,zzx3,zzx2,zzx1,zzx0) \
+ ((((zzx7) & 0xF) << 28) | (((zzx6) & 0xF) << 24) | \
+ (((zzx5) & 0xF) << 20) | (((zzx4) & 0xF) << 16) | \
+ (((zzx3) & 0xF) << 12) | (((zzx2) & 0xF) << 8) | \
+ (((zzx1) & 0xF) << 4) | (((zzx0) & 0xF) << 0))
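+
+/* Example (illustrative): XXXXXXXX(0xE,0x1,0xA,0x0,rD,0x0,0x0,rM)
+ builds 0xE1A00000 | (rD << 12) | rM, which is the encoding of
+ "mov rD, rM" with cond = AL. */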
+
+/* Generate a skeletal insn that involves an RI84 shifter operand.
+ Returns a word which is all zeroes apart from bits 25 and 11..0,
+ since it is those that encode the shifter operand (at least to the
+ extent that we care about it.) */
+static UInt skeletal_RI84 ( ARMRI84* ri )
+{
+ UInt instr;
+ if (ri->tag == ARMri84_I84) {
+ vassert(0 == (ri->ARMri84.I84.imm4 & ~0x0F));
+ vassert(0 == (ri->ARMri84.I84.imm8 & ~0xFF));
+ instr = 1 << 25;
+ instr |= (ri->ARMri84.I84.imm4 << 8);
+ instr |= ri->ARMri84.I84.imm8;
+ } else {
+ instr = 0 << 25;
+ instr |= iregNo(ri->ARMri84.R.reg);
+ }
+ return instr;
+}
+
+/* Ditto for RI5. Resulting word is zeroes apart from bit 4 and bits
+ 11..7. */
+static UInt skeletal_RI5 ( ARMRI5* ri )
+{
+ UInt instr;
+ if (ri->tag == ARMri5_I5) {
+ UInt imm5 = ri->ARMri5.I5.imm5;
+ vassert(imm5 >= 1 && imm5 <= 31);
+ instr = 0 << 4;
+ instr |= imm5 << 7;
+ } else {
+ instr = 1 << 4;
+ instr |= iregNo(ri->ARMri5.R.reg) << 8;
+ }
+ return instr;
+}
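+
+/* Example (illustrative): an ARMRI84_I84(0xFF, 4) operand produces
+ the skeleton 0x020004FF -- bit 25 set for "immediate", imm4 = 4 in
+ bits 11..8, imm8 = 0xFF in bits 7..0 -- denoting 0xFF ROR 8, that
+ is, 0xFF000000. A register-shift ARMRI5 instead sets bit 4 and
+ places the shift-amount register in bits 11..8. */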
+
+
+/* Get an immediate into a register, using only that
+ register. (very lame..) */
+static UInt* imm32_to_iregNo ( UInt* p, Int rD, UInt imm32 )
+{
+ UInt instr;
+ vassert(rD >= 0 && rD <= 14); // r15 not good to mess with!
+#if 0
+ if (0 == (imm32 & ~0xFF)) {
+ /* mov with a immediate shifter operand of (0, imm32) (??) */
+ instr = XXXXXX__(X1110,X0011,X1010,X0000,rD,X0000);
+ instr |= imm32;
+ *p++ = instr;
+ } else {
+ // this is very bad; causes Dcache pollution
+ // ldr rD, [pc]
+ instr = XXXXX___(X1110,X0101,X1001,X1111,rD);
+ *p++ = instr;
+ // b .+8
+ instr = 0xEA000000;
+ *p++ = instr;
+ // .word imm32
+ *p++ = imm32;
+ }
+#else
+ if (VEX_ARM_ARCHLEVEL(arm_hwcaps) > 6) {
+ /* Generate movw rD, #low16. Then, if the high 16 are
+ nonzero, generate movt rD, #high16. */
+ UInt lo16 = imm32 & 0xFFFF;
+ UInt hi16 = (imm32 >> 16) & 0xFFFF;
+ instr = XXXXXXXX(0xE, 0x3, 0x0, (lo16 >> 12) & 0xF, rD,
+ (lo16 >> 8) & 0xF, (lo16 >> 4) & 0xF,
+ lo16 & 0xF);
+ *p++ = instr;
+ if (hi16 != 0) {
+ instr = XXXXXXXX(0xE, 0x3, 0x4, (hi16 >> 12) & 0xF, rD,
+ (hi16 >> 8) & 0xF, (hi16 >> 4) & 0xF,
+ hi16 & 0xF);
+ *p++ = instr;
+ }
+ } else {
+ UInt imm, rot;
+ UInt op = X1010;
+ UInt rN = 0;
+ if ((imm32 & 0xFF) || (imm32 == 0)) {
+ imm = imm32 & 0xFF;
+ rot = 0;
+ instr = XXXXXXXX(0xE, 0x3, op, rN, rD, rot, imm >> 4, imm & 0xF);
+ *p++ = instr;
+ op = X1000;
+ rN = rD;
+ }
+ if (imm32 & 0xFF000000) {
+ imm = (imm32 >> 24) & 0xFF;
+ rot = 4;
+ instr = XXXXXXXX(0xE, 0x3, op, rN, rD, rot, imm >> 4, imm & 0xF);
+ *p++ = instr;
+ op = X1000;
+ rN = rD;
+ }
+ if (imm32 & 0xFF0000) {
+ imm = (imm32 >> 16) & 0xFF;
+ rot = 8;
+ instr = XXXXXXXX(0xE, 0x3, op, rN, rD, rot, imm >> 4, imm & 0xF);
+ *p++ = instr;
+ op = X1000;
+ rN = rD;
+ }
+ if (imm32 & 0xFF00) {
+ imm = (imm32 >> 8) & 0xFF;
+ rot = 12;
+ instr = XXXXXXXX(0xE, 0x3, op, rN, rD, rot, imm >> 4, imm & 0xF);
+ *p++ = instr;
+ op = X1000;
+ rN = rD;
+ }
+ }
+#endif
+ return p;
+}
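+
+/* Worked example (illustrative): on a >= v7 host,
+   imm32_to_iregNo(p, 0/*r0*/, 0x12345678) emits
+      movw r0, #0x5678   (0xE3005678)
+      movt r0, #0x1234   (0xE3401234)
+   whereas a pre-v7 host gets a MOV of the first contributing byte
+   followed by a rotated-immediate ORR for each remaining nonzero
+   byte. */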
+
+
+Int emit_ARMInstr ( UChar* buf, Int nbuf, ARMInstr* i,
+ Bool mode64, void* dispatch )
+{
+ UInt* p = (UInt*)buf;
+ vassert(nbuf >= 32);
+ vassert(mode64 == False);
+ vassert(0 == (((HWord)buf) & 3));
+   /* since we branch to lr (r14) to get back to dispatch: */
+ vassert(dispatch == NULL);
+
+ switch (i->tag) {
+ case ARMin_Alu: {
+ UInt instr, subopc;
+ UInt rD = iregNo(i->ARMin.Alu.dst);
+ UInt rN = iregNo(i->ARMin.Alu.argL);
+ ARMRI84* argR = i->ARMin.Alu.argR;
+ switch (i->ARMin.Alu.op) {
+ case ARMalu_ADDS: /* fallthru */
+ case ARMalu_ADD: subopc = X0100; break;
+ case ARMalu_ADC: subopc = X0101; break;
+ case ARMalu_SUBS: /* fallthru */
+ case ARMalu_SUB: subopc = X0010; break;
+ case ARMalu_SBC: subopc = X0110; break;
+ case ARMalu_AND: subopc = X0000; break;
+ case ARMalu_BIC: subopc = X1110; break;
+ case ARMalu_OR: subopc = X1100; break;
+ case ARMalu_XOR: subopc = X0001; break;
+ default: goto bad;
+ }
+ instr = skeletal_RI84(argR);
+ instr |= XXXXX___(X1110, (1 & (subopc >> 3)),
+ (subopc << 1) & 0xF, rN, rD);
+ if (i->ARMin.Alu.op == ARMalu_ADDS
+ || i->ARMin.Alu.op == ARMalu_SUBS) {
+ instr |= 1<<20; /* set the S bit */
+ }
+ *p++ = instr;
+ goto done;
+ }
+ case ARMin_Shift: {
+ UInt instr, subopc;
+         UInt rD = iregNo(i->ARMin.Shift.dst);
+         UInt rM = iregNo(i->ARMin.Shift.argL);
+ ARMRI5* argR = i->ARMin.Shift.argR;
+ switch (i->ARMin.Shift.op) {
+ case ARMsh_SHL: subopc = X0000; break;
+ case ARMsh_SHR: subopc = X0001; break;
+ case ARMsh_SAR: subopc = X0010; break;
+ default: goto bad;
+ }
+ instr = skeletal_RI5(argR);
+ instr |= XXXXX__X(X1110,X0001,X1010,X0000,rD, /* _ _ */ rM);
+ instr |= (subopc & 3) << 5;
+ *p++ = instr;
+ goto done;
+ }
+ case ARMin_Unary: {
+ UInt instr;
+         UInt rDst = iregNo(i->ARMin.Unary.dst);
+         UInt rSrc = iregNo(i->ARMin.Unary.src);
+ switch (i->ARMin.Unary.op) {
+ case ARMun_CLZ:
+ instr = XXXXXXXX(X1110,X0001,X0110,X1111,
+ rDst,X1111,X0001,rSrc);
+ *p++ = instr;
+ goto done;
+ case ARMun_NEG: /* RSB rD,rS,#0 */
+ instr = XXXXX___(X1110,0x2,0x6,rSrc,rDst);
+ *p++ = instr;
+ goto done;
+ case ARMun_NOT: {
+ UInt subopc = X1111; /* MVN */
+ instr = rSrc;
+ instr |= XXXXX___(X1110, (1 & (subopc >> 3)),
+ (subopc << 1) & 0xF, 0, rDst);
+ *p++ = instr;
+ goto done;
+ }
+ default:
+ break;
+ }
+ goto bad;
+ }
+ case ARMin_CmpOrTst: {
+ UInt instr = skeletal_RI84(i->ARMin.CmpOrTst.argR);
+ UInt subopc = i->ARMin.CmpOrTst.isCmp ? X1010 : X1000;
+ UInt SBZ = 0;
+ instr |= XXXXX___(X1110, (1 & (subopc >> 3)),
+ ((subopc << 1) & 0xF) | 1,
+                           iregNo(i->ARMin.CmpOrTst.argL), SBZ );
+ *p++ = instr;
+ goto done;
+ }
+ case ARMin_Mov: {
+ UInt instr = skeletal_RI84(i->ARMin.Mov.src);
+ UInt subopc = X1101; /* MOV */
+ UInt SBZ = 0;
+ instr |= XXXXX___(X1110, (1 & (subopc >> 3)),
+                           (subopc << 1) & 0xF, SBZ, iregNo(i->ARMin.Mov.dst));
+ *p++ = instr;
+ goto done;
+ }
+ case ARMin_Imm32: {
+ p = imm32_to_iregNo( (UInt*)p, iregNo(i->ARMin.Imm32.dst),
+ i->ARMin.Imm32.imm32 );
+ goto done;
+ }
+ case ARMin_LdSt32:
+ case ARMin_LdSt8U: {
+ UInt bL, bB;
+ HReg rD;
+ ARMAMode1* am;
+ if (i->tag == ARMin_LdSt32) {
+ bB = 0;
+ bL = i->ARMin.LdSt32.isLoad ? 1 : 0;
+ am = i->ARMin.LdSt32.amode;
+ rD = i->ARMin.LdSt32.rD;
+ } else {
+ bB = 1;
+ bL = i->ARMin.LdSt8U.isLoad ? 1 : 0;
+ am = i->ARMin.LdSt8U.amode;
+ rD = i->ARMin.LdSt8U.rD;
+ }
+ if (am->tag == ARMam1_RI) {
+ Int simm12;
+ UInt instr, bP;
+ if (am->ARMam1.RI.simm13 < 0) {
+ bP = 0;
+ simm12 = -am->ARMam1.RI.simm13;
+ } else {
+ bP = 1;
+ simm12 = am->ARMam1.RI.simm13;
+ }
+ vassert(simm12 >= 0 && simm12 <= 4095);
+ instr = XXXXX___(X1110,X0101,BITS4(bP,bB,0,bL),
+ iregNo(am->ARMam1.RI.reg),
+ iregNo(rD));
+ instr |= simm12;
+ *p++ = instr;
+ goto done;
+ } else {
+ // RR case
+ goto bad;
+ }
+ }
+ case ARMin_LdSt16: {
+ HReg rD = i->ARMin.LdSt16.rD;
+ UInt bS = i->ARMin.LdSt16.signedLoad ? 1 : 0;
+ UInt bL = i->ARMin.LdSt16.isLoad ? 1 : 0;
+ ARMAMode2* am = i->ARMin.LdSt16.amode;
+ if (am->tag == ARMam2_RI) {
+ HReg rN = am->ARMam2.RI.reg;
+ Int simm8;
+ UInt bP, imm8hi, imm8lo, instr;
+ if (am->ARMam2.RI.simm9 < 0) {
+ bP = 0;
+ simm8 = -am->ARMam2.RI.simm9;
+ } else {
+ bP = 1;
+ simm8 = am->ARMam2.RI.simm9;
+ }
+ vassert(simm8 >= 0 && simm8 <= 255);
+ imm8hi = (simm8 >> 4) & 0xF;
+ imm8lo = simm8 & 0xF;
+ vassert(!(bL == 0 && bS == 1)); // "! signed store"
+ /**/ if (bL == 0 && bS == 0) {
+ // strh
+ instr = XXXXXXXX(X1110,X0001, BITS4(bP,1,0,0), iregNo(rN),
+ iregNo(rD), imm8hi, X1011, imm8lo);
+ *p++ = instr;
+ goto done;
+ }
+ else if (bL == 1 && bS == 0) {
+ // ldrh
+ instr = XXXXXXXX(X1110,X0001, BITS4(bP,1,0,1), iregNo(rN),
+ iregNo(rD), imm8hi, X1011, imm8lo);
+ *p++ = instr;
+ goto done;
+ }
+ else if (bL == 1 && bS == 1) {
+ goto bad;
+ }
+ else vassert(0); // ill-constructed insn
+ } else {
+ // RR case
+ goto bad;
+ }
+ }
+ case ARMin_Ld8S:
+ goto bad;
+ case ARMin_Goto: {
+ UInt instr;
+ IRJumpKind jk = i->ARMin.Goto.jk;
+ ARMCondCode cond = i->ARMin.Goto.cond;
+ UInt rnext = iregNo(i->ARMin.Goto.gnext);
+ Int trc = -1;
+ switch (jk) {
+ case Ijk_Ret: case Ijk_Call: case Ijk_Boring:
+ break; /* no need to set GST in these common cases */
+ case Ijk_ClientReq:
+ trc = VEX_TRC_JMP_CLIENTREQ; break;
+ case Ijk_Sys_int128:
+ case Ijk_Sys_int129:
+ case Ijk_Sys_int130:
+ case Ijk_Yield:
+ case Ijk_EmWarn:
+ case Ijk_MapFail:
+ goto unhandled_jk;
+ case Ijk_NoDecode:
+ trc = VEX_TRC_JMP_NODECODE; break;
+ case Ijk_TInval:
+ trc = VEX_TRC_JMP_TINVAL; break;
+ case Ijk_NoRedir:
+ trc = VEX_TRC_JMP_NOREDIR; break;
+ case Ijk_Sys_sysenter:
+ case Ijk_SigTRAP:
+ case Ijk_SigSEGV:
+ goto unhandled_jk;
+ case Ijk_Sys_syscall:
+ trc = VEX_TRC_JMP_SYS_SYSCALL; break;
+ unhandled_jk:
+ default:
+ goto bad;
+ }
+ if (trc != -1) {
+ // mov{cond} r8, #trc
+ vassert(trc >= 0 && trc <= 255);
+ instr = (cond << 28) | 0x03A08000 | (0xFF & (UInt)trc);
+ *p++ = instr;
+ }
+ // mov{cond} r0, rnext
+ if (rnext != 0) {
+ instr = (cond << 28) | 0x01A00000 | rnext;
+ *p++ = instr;
+ }
+ // bx{cond} r14
+      instr = (cond << 28) | 0x012FFF1E;
+ *p++ = instr;
+ goto done;
+ }
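+      /* For example (illustrative): an Ijk_Boring goto with cond
+         ARMcc_AL and gnext in r2 emits just
+            mov r0, r2   (0xE1A00002)
+            bx  r14      (0xE12FFF1E)
+         leaving the next guest address in r0 for the dispatcher. */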
+ case ARMin_CMov: {
+ UInt instr = skeletal_RI84(i->ARMin.CMov.src);
+ UInt subopc = X1101; /* MOV */
+ UInt SBZ = 0;
+ instr |= XXXXX___(i->ARMin.CMov.cond, (1 & (subopc >> 3)),
+                           (subopc << 1) & 0xF, SBZ, iregNo(i->ARMin.CMov.dst));
+ *p++ = instr;
+ goto done;
+ }
+ case ARMin_Call: {
+ UInt instr;
+         /* Decide on a scratch reg used to hold the call address.
+            This has to be done as per the comments in getRegUsage. */
+ Int scratchNo;
+ switch (i->ARMin.Call.nArgRegs) {
+ case 0: scratchNo = 0; break;
+ case 1: scratchNo = 1; break;
+ case 2: scratchNo = 2; break;
+ case 3: scratchNo = 3; break;
+ case 4: scratchNo = 11; break;
+ default: vassert(0);
+ }
+ // r"scratchNo" = &target
+ p = imm32_to_iregNo( (UInt*)p,
+ scratchNo, (UInt)i->ARMin.Call.target );
+ // blx{cond} r"scratchNo"
+ instr = XXX___XX(i->ARMin.Call.cond, X0001, X0010, /*___*/
+ X0011, scratchNo);
+ instr |= 0xFFF << 8; // stick in the SBOnes
+ *p++ = instr;
+ goto done;
+ }
+ case ARMin_Mul: {
+ /* E0000392 mul r0, r2, r3
+ E0810392 umull r0(LO), r1(HI), r2, r3
+ E0C10392 smull r0(LO), r1(HI), r2, r3
+ */
+ switch (i->ARMin.Mul.op) {
+ case ARMmul_PLAIN: *p++ = 0xE0000392; goto done;
+ case ARMmul_ZX: *p++ = 0xE0810392; goto done;
+ case ARMmul_SX: *p++ = 0xE0C10392; goto done;
+ default: vassert(0);
+ }
+ goto bad;
+ }
+ case ARMin_LdrEX: {
+ /* E1910F9F ldrex r0, [r1]
+ E1F10F9F ldrexh r0, [r1]
+ E1D10F9F ldrexb r0, [r1]
+ */
+ switch (i->ARMin.LdrEX.szB) {
+ case 4: *p++ = 0xE1910F9F; goto done;
+ //case 2: *p++ = 0xE1F10F9F; goto done;
+ case 1: *p++ = 0xE1D10F9F; goto done;
+ default: break;
+ }
+ goto bad;
+ }
+ case ARMin_StrEX: {
+ /* E1820F91 strex r0, r1, [r2]
+ E1E20F91 strexh r0, r1, [r2]
+ E1C20F91 strexb r0, r1, [r2]
+ */
+ switch (i->ARMin.StrEX.szB) {
+ case 4: *p++ = 0xE1820F91; goto done;
+ //case 2: *p++ = 0xE1E20F91; goto done;
+ case 1: *p++ = 0xE1C20F91; goto done;
+ default: break;
+ }
+ goto bad;
+ }
+ case ARMin_VLdStD: {
+ UInt dD = dregNo(i->ARMin.VLdStD.dD);
+ UInt rN = iregNo(i->ARMin.VLdStD.amode->reg);
+ Int simm11 = i->ARMin.VLdStD.amode->simm11;
+ UInt off8 = simm11 >= 0 ? simm11 : ((UInt)(-simm11));
+ UInt bU = simm11 >= 0 ? 1 : 0;
+ UInt bL = i->ARMin.VLdStD.isLoad ? 1 : 0;
+ UInt insn;
+ vassert(0 == (off8 & 3));
+ off8 >>= 2;
+ vassert(0 == (off8 & 0xFFFFFF00));
+ insn = XXXXXX__(0xE,X1101,BITS4(bU,0,0,bL),rN,dD,X1011);
+ insn |= off8;
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VLdStS: {
+ UInt fD = fregNo(i->ARMin.VLdStS.fD);
+ UInt rN = iregNo(i->ARMin.VLdStS.amode->reg);
+ Int simm11 = i->ARMin.VLdStS.amode->simm11;
+ UInt off8 = simm11 >= 0 ? simm11 : ((UInt)(-simm11));
+ UInt bU = simm11 >= 0 ? 1 : 0;
+ UInt bL = i->ARMin.VLdStS.isLoad ? 1 : 0;
+ UInt bD = fD & 1;
+ UInt insn;
+ vassert(0 == (off8 & 3));
+ off8 >>= 2;
+ vassert(0 == (off8 & 0xFFFFFF00));
+ insn = XXXXXX__(0xE,X1101,BITS4(bU,bD,0,bL),rN, (fD >> 1), X1010);
+ insn |= off8;
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VAluD: {
+ UInt dN = dregNo(i->ARMin.VAluD.argL);
+ UInt dD = dregNo(i->ARMin.VAluD.dst);
+ UInt dM = dregNo(i->ARMin.VAluD.argR);
+ UInt pqrs = X1111; /* undefined */
+ switch (i->ARMin.VAluD.op) {
+ case ARMvfp_ADD: pqrs = X0110; break;
+ case ARMvfp_SUB: pqrs = X0111; break;
+ case ARMvfp_MUL: pqrs = X0100; break;
+ case ARMvfp_DIV: pqrs = X1000; break;
+ default: goto bad;
+ }
+ vassert(pqrs != X1111);
+ UInt bP = (pqrs >> 3) & 1;
+ UInt bQ = (pqrs >> 2) & 1;
+ UInt bR = (pqrs >> 1) & 1;
+ UInt bS = (pqrs >> 0) & 1;
+ UInt insn = XXXXXXXX(0xE, X1110, BITS4(bP,0,bQ,bR), dN, dD,
+ X1011, BITS4(0,bS,0,0), dM);
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VAluS: {
+ UInt dN = fregNo(i->ARMin.VAluS.argL);
+ UInt dD = fregNo(i->ARMin.VAluS.dst);
+ UInt dM = fregNo(i->ARMin.VAluS.argR);
+ UInt bN = dN & 1;
+ UInt bD = dD & 1;
+ UInt bM = dM & 1;
+ UInt pqrs = X1111; /* undefined */
+ switch (i->ARMin.VAluS.op) {
+ case ARMvfp_ADD: pqrs = X0110; break;
+ case ARMvfp_SUB: pqrs = X0111; break;
+ case ARMvfp_MUL: pqrs = X0100; break;
+ case ARMvfp_DIV: pqrs = X1000; break;
+ default: goto bad;
+ }
+ vassert(pqrs != X1111);
+ UInt bP = (pqrs >> 3) & 1;
+ UInt bQ = (pqrs >> 2) & 1;
+ UInt bR = (pqrs >> 1) & 1;
+ UInt bS = (pqrs >> 0) & 1;
+ UInt insn = XXXXXXXX(0xE, X1110, BITS4(bP,bD,bQ,bR),
+ (dN >> 1), (dD >> 1),
+ X1010, BITS4(bN,bS,bM,0), (dM >> 1));
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VUnaryD: {
+ UInt dD = dregNo(i->ARMin.VUnaryD.dst);
+ UInt dM = dregNo(i->ARMin.VUnaryD.src);
+ UInt insn = 0;
+ switch (i->ARMin.VUnaryD.op) {
+ case ARMvfpu_COPY:
+ insn = XXXXXXXX(0xE, X1110,X1011,X0000,dD,X1011,X0100,dM);
+ break;
+ case ARMvfpu_ABS:
+ insn = XXXXXXXX(0xE, X1110,X1011,X0000,dD,X1011,X1100,dM);
+ break;
+ case ARMvfpu_NEG:
+ insn = XXXXXXXX(0xE, X1110,X1011,X0001,dD,X1011,X0100,dM);
+ break;
+ case ARMvfpu_SQRT:
+ insn = XXXXXXXX(0xE, X1110,X1011,X0001,dD,X1011,X1100,dM);
+ break;
+ default:
+ goto bad;
+ }
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VUnaryS: {
+ UInt fD = fregNo(i->ARMin.VUnaryS.dst);
+ UInt fM = fregNo(i->ARMin.VUnaryS.src);
+ UInt insn = 0;
+ switch (i->ARMin.VUnaryS.op) {
+ case ARMvfpu_COPY:
+ insn = XXXXXXXX(0xE, X1110, BITS4(1,(fD & 1),1,1), X0000,
+ (fD >> 1), X1010, BITS4(0,1,(fM & 1),0),
+ (fM >> 1));
+ break;
+ case ARMvfpu_ABS:
+ insn = XXXXXXXX(0xE, X1110, BITS4(1,(fD & 1),1,1), X0000,
+ (fD >> 1), X1010, BITS4(1,1,(fM & 1),0),
+ (fM >> 1));
+ break;
+ case ARMvfpu_NEG:
+ insn = XXXXXXXX(0xE, X1110, BITS4(1,(fD & 1),1,1), X0001,
+ (fD >> 1), X1010, BITS4(0,1,(fM & 1),0),
+ (fM >> 1));
+ break;
+ case ARMvfpu_SQRT:
+ insn = XXXXXXXX(0xE, X1110, BITS4(1,(fD & 1),1,1), X0001,
+ (fD >> 1), X1010, BITS4(1,1,(fM & 1),0),
+ (fM >> 1));
+ break;
+ default:
+ goto bad;
+ }
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VCmpD: {
+ UInt dD = dregNo(i->ARMin.VCmpD.argL);
+ UInt dM = dregNo(i->ARMin.VCmpD.argR);
+ UInt insn = XXXXXXXX(0xE, X1110, X1011, X0100, dD, X1011, X0100, dM);
+ *p++ = insn; /* FCMPD dD, dM */
+ *p++ = 0xEEF1FA10; /* FMSTAT */
+ goto done;
+ }
+ case ARMin_VCMovD: {
+ UInt cc = (UInt)i->ARMin.VCMovD.cond;
+ UInt dD = dregNo(i->ARMin.VCMovD.dst);
+ UInt dM = dregNo(i->ARMin.VCMovD.src);
+ vassert(cc < 16 && cc != ARMcc_AL);
+ UInt insn = XXXXXXXX(cc, X1110,X1011,X0000,dD,X1011,X0100,dM);
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VCMovS: {
+ UInt cc = (UInt)i->ARMin.VCMovS.cond;
+ UInt fD = fregNo(i->ARMin.VCMovS.dst);
+ UInt fM = fregNo(i->ARMin.VCMovS.src);
+ vassert(cc < 16 && cc != ARMcc_AL);
+ UInt insn = XXXXXXXX(cc, X1110, BITS4(1,(fD & 1),1,1),
+ X0000,(fD >> 1),X1010,
+ BITS4(0,1,(fM & 1),0), (fM >> 1));
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VCvtSD: {
+ if (i->ARMin.VCvtSD.sToD) {
+ UInt dD = dregNo(i->ARMin.VCvtSD.dst);
+ UInt fM = fregNo(i->ARMin.VCvtSD.src);
+ UInt insn = XXXXXXXX(0xE, X1110, X1011, X0111, dD, X1010,
+ BITS4(1,1, (fM & 1), 0),
+ (fM >> 1));
+ *p++ = insn;
+ goto done;
+ } else {
+ UInt fD = fregNo(i->ARMin.VCvtSD.dst);
+ UInt dM = dregNo(i->ARMin.VCvtSD.src);
+ UInt insn = XXXXXXXX(0xE, X1110, BITS4(1,(fD & 1),1,1),
+ X0111, (fD >> 1),
+ X1011, X1100, dM);
+ *p++ = insn;
+ goto done;
+ }
+ goto bad;
+ }
+ case ARMin_VXferD: {
+ UInt dD = dregNo(i->ARMin.VXferD.dD);
+ UInt rHi = iregNo(i->ARMin.VXferD.rHi);
+ UInt rLo = iregNo(i->ARMin.VXferD.rLo);
+ /* vmov dD, rLo, rHi is
+ E C 4 rHi rLo B (0,0,dD[4],1) dD[3:0]
+ vmov rLo, rHi, dD is
+ E C 5 rHi rLo B (0,0,dD[4],1) dD[3:0]
+ */
+ UInt insn
+ = XXXXXXXX(0xE, 0xC, i->ARMin.VXferD.toD ? 4 : 5,
+ rHi, rLo, 0xB,
+ BITS4(0,0, ((dD >> 4) & 1), 1), (dD & 0xF));
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VXferS: {
+ UInt fD = fregNo(i->ARMin.VXferS.fD);
+ UInt rLo = iregNo(i->ARMin.VXferS.rLo);
+ /* vmov fD, rLo is
+ E E 0 fD[4:1] rLo A (fD[0],0,0,1) 0
+ vmov rLo, fD is
+ E E 1 fD[4:1] rLo A (fD[0],0,0,1) 0
+ */
+ UInt insn
+ = XXXXXXXX(0xE, 0xE, i->ARMin.VXferS.toS ? 0 : 1,
+ (fD >> 1) & 0xF, rLo, 0xA,
+ BITS4((fD & 1),0,0,1), 0);
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_VCvtID: {
+ Bool iToD = i->ARMin.VCvtID.iToD;
+ Bool syned = i->ARMin.VCvtID.syned;
+ if (iToD && syned) {
+ // FSITOD: I32S-in-freg to F64-in-dreg
+ UInt regF = fregNo(i->ARMin.VCvtID.src);
+ UInt regD = dregNo(i->ARMin.VCvtID.dst);
+ UInt insn = XXXXXXXX(0xE, X1110, X1011, X1000, regD,
+ X1011, BITS4(1,1,(regF & 1),0),
+ (regF >> 1) & 0xF);
+ *p++ = insn;
+ goto done;
+ }
+ if (iToD && (!syned)) {
+ // FUITOD: I32U-in-freg to F64-in-dreg
+ UInt regF = fregNo(i->ARMin.VCvtID.src);
+ UInt regD = dregNo(i->ARMin.VCvtID.dst);
+ UInt insn = XXXXXXXX(0xE, X1110, X1011, X1000, regD,
+ X1011, BITS4(0,1,(regF & 1),0),
+ (regF >> 1) & 0xF);
+ *p++ = insn;
+ goto done;
+ }
+ if ((!iToD) && syned) {
+ // FTOSID: F64-in-dreg to I32S-in-freg
+ UInt regD = dregNo(i->ARMin.VCvtID.src);
+ UInt regF = fregNo(i->ARMin.VCvtID.dst);
+ UInt insn = XXXXXXXX(0xE, X1110, BITS4(1,(regF & 1),1,1),
+ X1101, (regF >> 1) & 0xF,
+ X1011, X0100, regD);
+ *p++ = insn;
+ goto done;
+ }
+ if ((!iToD) && (!syned)) {
+ // FTOUID: F64-in-dreg to I32U-in-freg
+ UInt regD = dregNo(i->ARMin.VCvtID.src);
+ UInt regF = fregNo(i->ARMin.VCvtID.dst);
+ UInt insn = XXXXXXXX(0xE, X1110, BITS4(1,(regF & 1),1,1),
+ X1100, (regF >> 1) & 0xF,
+ X1011, X0100, regD);
+ *p++ = insn;
+ goto done;
+ }
+ /*UNREACHED*/
+ vassert(0);
+ }
+ case ARMin_FPSCR: {
+ Bool toFPSCR = i->ARMin.FPSCR.toFPSCR;
+         UInt iReg = iregNo(i->ARMin.FPSCR.iReg);
+ if (toFPSCR) {
+ /* fmxr fpscr, iReg is EEE1 iReg A10 */
+ *p++ = 0xEEE10A10 | ((iReg & 0xF) << 12);
+ goto done;
+ }
+ goto bad; // FPSCR -> iReg case currently ATC
+ }
+ case ARMin_MFence: {
+ *p++ = 0xEE070F9A; /* mcr 15,0,r0,c7,c10,4 (DSB) */
+ *p++ = 0xEE070FBA; /* mcr 15,0,r0,c7,c10,5 (DMB) */
+ *p++ = 0xEE070F95; /* mcr 15,0,r0,c7,c5,4 (ISB) */
+ goto done;
+ }
+ case ARMin_NLdStQ: {
+ UInt regD = qregNo(i->ARMin.NLdStQ.dQ) << 1;
+ UInt regN, regM;
+ UInt D = regD >> 4;
+ UInt bL = i->ARMin.NLdStQ.isLoad ? 1 : 0;
+ UInt insn;
+ vassert(hregClass(i->ARMin.NLdStQ.dQ) == HRcVec128);
+ regD &= 0xF;
+ if (i->ARMin.NLdStQ.amode->tag == ARMamN_RR) {
+ regN = iregNo(i->ARMin.NLdStQ.amode->ARMamN.RR.rN);
+ regM = iregNo(i->ARMin.NLdStQ.amode->ARMamN.RR.rM);
+ } else {
+ regN = iregNo(i->ARMin.NLdStQ.amode->ARMamN.R.rN);
+ regM = 15;
+ }
+ insn = XXXXXXXX(0xF, X0100, BITS4(0, D, bL, 0),
+ regN, regD, X1010, X1000, regM);
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_NLdStD: {
+ UInt regD = dregNo(i->ARMin.NLdStD.dD);
+ UInt regN, regM;
+ UInt D = regD >> 4;
+ UInt bL = i->ARMin.NLdStD.isLoad ? 1 : 0;
+ UInt insn;
+ vassert(hregClass(i->ARMin.NLdStD.dD) == HRcFlt64);
+ regD &= 0xF;
+ if (i->ARMin.NLdStD.amode->tag == ARMamN_RR) {
+ regN = iregNo(i->ARMin.NLdStD.amode->ARMamN.RR.rN);
+ regM = iregNo(i->ARMin.NLdStD.amode->ARMamN.RR.rM);
+ } else {
+ regN = iregNo(i->ARMin.NLdStD.amode->ARMamN.R.rN);
+ regM = 15;
+ }
+ insn = XXXXXXXX(0xF, X0100, BITS4(0, D, bL, 0),
+ regN, regD, X0111, X1000, regM);
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_NUnaryS: {
+ UInt Q = i->ARMin.NUnaryS.Q ? 1 : 0;
+ UInt regD, D;
+ UInt regM, M;
+ UInt size = i->ARMin.NUnaryS.size;
+ UInt insn;
+ UInt opc, opc1, opc2;
+ switch (i->ARMin.NUnaryS.op) {
+ case ARMneon_VDUP:
+ if (i->ARMin.NUnaryS.size >= 16)
+ goto bad;
+ if (i->ARMin.NUnaryS.dst->tag != ARMNRS_Reg)
+ goto bad;
+ if (i->ARMin.NUnaryS.src->tag != ARMNRS_Scalar)
+ goto bad;
+ regD = (hregClass(i->ARMin.NUnaryS.dst->reg) == HRcVec128)
+ ? (qregNo(i->ARMin.NUnaryS.dst->reg) << 1)
+ : dregNo(i->ARMin.NUnaryS.dst->reg);
+ regM = (hregClass(i->ARMin.NUnaryS.src->reg) == HRcVec128)
+ ? (qregNo(i->ARMin.NUnaryS.src->reg) << 1)
+ : dregNo(i->ARMin.NUnaryS.src->reg);
+ D = regD >> 4;
+ M = regM >> 4;
+ regD &= 0xf;
+ regM &= 0xf;
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1),
+ (i->ARMin.NUnaryS.size & 0xf), regD,
+ X1100, BITS4(0,Q,M,0), regM);
+ *p++ = insn;
+ goto done;
+ case ARMneon_SETELEM:
+ regD = Q ? (qregNo(i->ARMin.NUnaryS.dst->reg) << 1) :
+ dregNo(i->ARMin.NUnaryS.dst->reg);
+ regM = iregNo(i->ARMin.NUnaryS.src->reg);
+ M = regM >> 4;
+ D = regD >> 4;
+ regM &= 0xF;
+ regD &= 0xF;
+ if (i->ARMin.NUnaryS.dst->tag != ARMNRS_Scalar)
+ goto bad;
+ switch (size) {
+ case 0:
+ if (i->ARMin.NUnaryS.dst->index > 7)
+ goto bad;
+ opc = X1000 | i->ARMin.NUnaryS.dst->index;
+ break;
+ case 1:
+ if (i->ARMin.NUnaryS.dst->index > 3)
+ goto bad;
+ opc = X0001 | (i->ARMin.NUnaryS.dst->index << 1);
+ break;
+ case 2:
+ if (i->ARMin.NUnaryS.dst->index > 1)
+ goto bad;
+ opc = X0000 | (i->ARMin.NUnaryS.dst->index << 2);
+ break;
+ default:
+ goto bad;
+ }
+ opc1 = (opc >> 2) & 3;
+ opc2 = opc & 3;
+ insn = XXXXXXXX(0xE, X1110, BITS4(0,(opc1 >> 1),(opc1 & 1),0),
+ regD, regM, X1011,
+ BITS4(D,(opc2 >> 1),(opc2 & 1),1), X0000);
+ *p++ = insn;
+ goto done;
+ case ARMneon_GETELEMU:
+ regM = Q ? (qregNo(i->ARMin.NUnaryS.src->reg) << 1) :
+ dregNo(i->ARMin.NUnaryS.src->reg);
+ regD = iregNo(i->ARMin.NUnaryS.dst->reg);
+ M = regM >> 4;
+ D = regD >> 4;
+ regM &= 0xF;
+ regD &= 0xF;
+ if (i->ARMin.NUnaryS.src->tag != ARMNRS_Scalar)
+ goto bad;
+ switch (size) {
+ case 0:
+ if (Q && i->ARMin.NUnaryS.src->index > 7) {
+ regM++;
+ i->ARMin.NUnaryS.src->index -= 8;
+ }
+ if (i->ARMin.NUnaryS.src->index > 7)
+ goto bad;
+ opc = X1000 | i->ARMin.NUnaryS.src->index;
+ break;
+ case 1:
+ if (Q && i->ARMin.NUnaryS.src->index > 3) {
+ regM++;
+ i->ARMin.NUnaryS.src->index -= 4;
+ }
+ if (i->ARMin.NUnaryS.src->index > 3)
+ goto bad;
+ opc = X0001 | (i->ARMin.NUnaryS.src->index << 1);
+ break;
+ case 2:
+ goto bad;
+ default:
+ goto bad;
+ }
+ opc1 = (opc >> 2) & 3;
+ opc2 = opc & 3;
+ insn = XXXXXXXX(0xE, X1110, BITS4(1,(opc1 >> 1),(opc1 & 1),1),
+ regM, regD, X1011,
+ BITS4(M,(opc2 >> 1),(opc2 & 1),1), X0000);
+ *p++ = insn;
+ goto done;
+ case ARMneon_GETELEMS:
+ regM = Q ? (qregNo(i->ARMin.NUnaryS.src->reg) << 1) :
+ dregNo(i->ARMin.NUnaryS.src->reg);
+ regD = iregNo(i->ARMin.NUnaryS.dst->reg);
+ M = regM >> 4;
+ D = regD >> 4;
+ regM &= 0xF;
+ regD &= 0xF;
+ if (i->ARMin.NUnaryS.src->tag != ARMNRS_Scalar)
+ goto bad;
+ switch (size) {
+ case 0:
+ if (Q && i->ARMin.NUnaryS.src->index > 7) {
+ regM++;
+ i->ARMin.NUnaryS.src->index -= 8;
+ }
+ if (i->ARMin.NUnaryS.src->index > 7)
+ goto bad;
+ opc = X1000 | i->ARMin.NUnaryS.src->index;
+ break;
+ case 1:
+ if (Q && i->ARMin.NUnaryS.src->index > 3) {
+ regM++;
+ i->ARMin.NUnaryS.src->index -= 4;
+ }
+ if (i->ARMin.NUnaryS.src->index > 3)
+ goto bad;
+ opc = X0001 | (i->ARMin.NUnaryS.src->index << 1);
+ break;
+ case 2:
+ if (Q && i->ARMin.NUnaryS.src->index > 1) {
+ regM++;
+ i->ARMin.NUnaryS.src->index -= 2;
+ }
+ if (i->ARMin.NUnaryS.src->index > 1)
+ goto bad;
+ opc = X0000 | (i->ARMin.NUnaryS.src->index << 2);
+ break;
+ default:
+ goto bad;
+ }
+ opc1 = (opc >> 2) & 3;
+ opc2 = opc & 3;
+ insn = XXXXXXXX(0xE, X1110, BITS4(0,(opc1 >> 1),(opc1 & 1),1),
+ regM, regD, X1011,
+ BITS4(M,(opc2 >> 1),(opc2 & 1),1), X0000);
+ *p++ = insn;
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+ case ARMin_NUnary: {
+ UInt Q = i->ARMin.NUnary.Q ? 1 : 0;
+ UInt regD = (hregClass(i->ARMin.NUnary.dst) == HRcVec128)
+ ? (qregNo(i->ARMin.NUnary.dst) << 1)
+ : dregNo(i->ARMin.NUnary.dst);
+ UInt regM, M;
+ UInt D = regD >> 4;
+ UInt sz1 = i->ARMin.NUnary.size >> 1;
+ UInt sz2 = i->ARMin.NUnary.size & 1;
+ UInt sz = i->ARMin.NUnary.size;
+ UInt insn;
+ UInt F = 0; /* TODO: floating point EQZ ??? */
+ if (i->ARMin.NUnary.op != ARMneon_DUP) {
+ regM = (hregClass(i->ARMin.NUnary.src) == HRcVec128)
+ ? (qregNo(i->ARMin.NUnary.src) << 1)
+ : dregNo(i->ARMin.NUnary.src);
+ M = regM >> 4;
+ } else {
+ regM = iregNo(i->ARMin.NUnary.src);
+ M = regM >> 4;
+ }
+ regD &= 0xF;
+ regM &= 0xF;
+ switch (i->ARMin.NUnary.op) {
+ case ARMneon_COPY: /* VMOV reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regM, regD, X0001,
+ BITS4(M,Q,M,1), regM);
+ break;
+ case ARMneon_COPYN: /* VMOVN regD, regQ */
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0),
+ regD, X0010, BITS4(0,0,M,0), regM);
+ break;
+ case ARMneon_COPYQNSS: /* VQMOVN regD, regQ */
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0),
+ regD, X0010, BITS4(1,0,M,0), regM);
+ break;
+ case ARMneon_COPYQNUS: /* VQMOVUN regD, regQ */
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0),
+ regD, X0010, BITS4(0,1,M,0), regM);
+ break;
+ case ARMneon_COPYQNUU: /* VQMOVN regD, regQ */
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0),
+ regD, X0010, BITS4(1,1,M,0), regM);
+ break;
+ case ARMneon_COPYLS: /* VMOVL regQ, regD */
+ if (sz >= 3)
+ goto bad;
+ insn = XXXXXXXX(0xF, X0010,
+ BITS4(1,D,(sz == 2) ? 1 : 0,(sz == 1) ? 1 : 0),
+ BITS4((sz == 0) ? 1 : 0,0,0,0),
+ regD, X1010, BITS4(0,0,M,1), regM);
+ break;
+ case ARMneon_COPYLU: /* VMOVL regQ, regD */
+ if (sz >= 3)
+ goto bad;
+ insn = XXXXXXXX(0xF, X0011,
+ BITS4(1,D,(sz == 2) ? 1 : 0,(sz == 1) ? 1 : 0),
+ BITS4((sz == 0) ? 1 : 0,0,0,0),
+ regD, X1010, BITS4(0,0,M,1), regM);
+ break;
+ case ARMneon_NOT: /* VMVN reg, reg*/
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X0000, regD, X0101,
+ BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_EQZ:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,1),
+ regD, BITS4(0,F,0,1), BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_CNT:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X0000, regD, X0101,
+ BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_CLZ:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0),
+ regD, X0100, BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_CLS:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0),
+ regD, X0100, BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_ABS:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,1),
+ regD, X0011, BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_DUP:
+ sz1 = i->ARMin.NUnary.size == 0 ? 1 : 0;
+ sz2 = i->ARMin.NUnary.size == 1 ? 1 : 0;
+ vassert(sz1 + sz2 < 2);
+ insn = XXXXXXXX(0xE, X1110, BITS4(1, sz1, Q, 0), regD, regM,
+ X1011, BITS4(D,0,sz2,1), X0000);
+ break;
+ case ARMneon_REV16:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0),
+ regD, BITS4(0,0,0,1), BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_REV32:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0),
+ regD, BITS4(0,0,0,0), BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_REV64:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0),
+ regD, BITS4(0,0,0,0), BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_PADDLU:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0),
+ regD, X0010, BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_PADDLS:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,0,0),
+ regD, X0010, BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_VQSHLNUU:
+ insn = XXXXXXXX(0xF, X0011,
+ (1 << 3) | (D << 2) | ((sz >> 4) & 3),
+ sz & 0xf, regD, X0111,
+ BITS4(sz >> 6,Q,M,1), regM);
+ break;
+ case ARMneon_VQSHLNSS:
+ insn = XXXXXXXX(0xF, X0010,
+ (1 << 3) | (D << 2) | ((sz >> 4) & 3),
+ sz & 0xf, regD, X0111,
+ BITS4(sz >> 6,Q,M,1), regM);
+ break;
+ case ARMneon_VQSHLNUS:
+ insn = XXXXXXXX(0xF, X0011,
+ (1 << 3) | (D << 2) | ((sz >> 4) & 3),
+ sz & 0xf, regD, X0110,
+ BITS4(sz >> 6,Q,M,1), regM);
+ break;
+ case ARMneon_VCVTFtoS:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0111,
+ BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_VCVTFtoU:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0111,
+ BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_VCVTStoF:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0110,
+ BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_VCVTUtoF:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0110,
+ BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_VCVTFtoFixedU:
+ sz1 = (sz >> 5) & 1;
+ sz2 = (sz >> 4) & 1;
+ sz &= 0xf;
+ insn = XXXXXXXX(0xF, X0011,
+ BITS4(1,D,sz1,sz2), sz, regD, X1111,
+ BITS4(0,Q,M,1), regM);
+ break;
+ case ARMneon_VCVTFtoFixedS:
+ sz1 = (sz >> 5) & 1;
+ sz2 = (sz >> 4) & 1;
+ sz &= 0xf;
+ insn = XXXXXXXX(0xF, X0010,
+ BITS4(1,D,sz1,sz2), sz, regD, X1111,
+ BITS4(0,Q,M,1), regM);
+ break;
+ case ARMneon_VCVTFixedUtoF:
+ sz1 = (sz >> 5) & 1;
+ sz2 = (sz >> 4) & 1;
+ sz &= 0xf;
+ insn = XXXXXXXX(0xF, X0011,
+ BITS4(1,D,sz1,sz2), sz, regD, X1110,
+ BITS4(0,Q,M,1), regM);
+ break;
+ case ARMneon_VCVTFixedStoF:
+ sz1 = (sz >> 5) & 1;
+ sz2 = (sz >> 4) & 1;
+ sz &= 0xf;
+ insn = XXXXXXXX(0xF, X0010,
+ BITS4(1,D,sz1,sz2), sz, regD, X1110,
+ BITS4(0,Q,M,1), regM);
+ break;
+ case ARMneon_VCVTF32toF16:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X0110, regD, X0110,
+ BITS4(0,0,M,0), regM);
+ break;
+ case ARMneon_VCVTF16toF32:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X0110, regD, X0111,
+ BITS4(0,0,M,0), regM);
+ break;
+ case ARMneon_VRECIP:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0100,
+ BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_VRECIPF:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0101,
+ BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_VABSFP:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1001, regD, X0111,
+ BITS4(0,Q,M,0), regM);
+ break;
+ case ARMneon_VRSQRTEFP:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0101,
+ BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_VRSQRTE:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1011, regD, X0100,
+ BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_VNEGF:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), X1001, regD, X0111,
+ BITS4(1,Q,M,0), regM);
+ break;
+
+ default:
+ goto bad;
+ }
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_NDual: {
+ UInt Q = i->ARMin.NDual.Q ? 1 : 0;
+ UInt regD = (hregClass(i->ARMin.NDual.arg1) == HRcVec128)
+ ? (qregNo(i->ARMin.NDual.arg1) << 1)
+ : dregNo(i->ARMin.NDual.arg1);
+ UInt regM = (hregClass(i->ARMin.NDual.arg2) == HRcVec128)
+ ? (qregNo(i->ARMin.NDual.arg2) << 1)
+ : dregNo(i->ARMin.NDual.arg2);
+ UInt D = regD >> 4;
+ UInt M = regM >> 4;
+ UInt sz1 = i->ARMin.NDual.size >> 1;
+ UInt sz2 = i->ARMin.NDual.size & 1;
+ UInt insn;
+ regD &= 0xF;
+ regM &= 0xF;
+ switch (i->ARMin.NDual.op) {
+ case ARMneon_TRN: /* VTRN reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0),
+ regD, X0000, BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_ZIP: /* VZIP reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0),
+ regD, X0001, BITS4(1,Q,M,0), regM);
+ break;
+ case ARMneon_UZP: /* VUZP reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), BITS4(sz1,sz2,1,0),
+ regD, X0001, BITS4(0,Q,M,0), regM);
+ break;
+ default:
+ goto bad;
+ }
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_NBinary: {
+ UInt Q = i->ARMin.NBinary.Q ? 1 : 0;
+ UInt regD = (hregClass(i->ARMin.NBinary.dst) == HRcVec128)
+ ? (qregNo(i->ARMin.NBinary.dst) << 1)
+ : dregNo(i->ARMin.NBinary.dst);
+ UInt regN = (hregClass(i->ARMin.NBinary.argL) == HRcVec128)
+ ? (qregNo(i->ARMin.NBinary.argL) << 1)
+ : dregNo(i->ARMin.NBinary.argL);
+ UInt regM = (hregClass(i->ARMin.NBinary.argR) == HRcVec128)
+ ? (qregNo(i->ARMin.NBinary.argR) << 1)
+ : dregNo(i->ARMin.NBinary.argR);
+ UInt sz1 = i->ARMin.NBinary.size >> 1;
+ UInt sz2 = i->ARMin.NBinary.size & 1;
+ UInt D = regD >> 4;
+ UInt N = regN >> 4;
+ UInt M = regM >> 4;
+ UInt insn;
+ regD &= 0xF;
+ regM &= 0xF;
+ regN &= 0xF;
+ switch (i->ARMin.NBinary.op) {
+ case ARMneon_VAND: /* VAND reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD, X0001,
+ BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VORR: /* VORR reg, reg, reg*/
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regN, regD, X0001,
+ BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VXOR: /* VEOR reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD, X0001,
+ BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VADD: /* VADD reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X1000, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VSUB: /* VSUB reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X1000, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VMINU: /* VMIN.Uxx reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X0110, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VMINS: /* VMIN.Sxx reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X0110, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VMAXU: /* VMAX.Uxx reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X0110, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VMAXS: /* VMAX.Sxx reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X0110, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VRHADDS: /* VRHADD.Sxx reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X0001, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VRHADDU: /* VRHADD.Uxx reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X0001, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VQADDU: /* VQADD unsigned reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X0000, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VQADDS: /* VQADD signed reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X0000, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VQSUBU: /* VQSUB unsigned reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X0010, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VQSUBS: /* VQSUB signed reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X0010, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VCGTU: /* VCGT unsigned reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X0011, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VCGTS: /* VCGT signed reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X0011, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VCGEU: /* VCGE unsigned reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X0011, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VCGES: /* VCGE signed reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X0011, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VCEQ: /* VCEQ reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X1000, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VEXT: /* VEXT.8 reg, reg, #imm4*/
+ if (i->ARMin.NBinary.size >= 16)
+ goto bad;
+ insn = XXXXXXXX(0xF, X0010, BITS4(1,D,1,1), regN, regD,
+ i->ARMin.NBinary.size & 0xf, BITS4(N,Q,M,0),
+ regM);
+ break;
+ case ARMneon_VMUL:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X1001, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VMULLU:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,sz1,sz2), regN, regD,
+ X1100, BITS4(N,0,M,0), regM);
+ break;
+ case ARMneon_VMULLS:
+ insn = XXXXXXXX(0xF, X0010, BITS4(1,D,sz1,sz2), regN, regD,
+ X1100, BITS4(N,0,M,0), regM);
+ break;
+ case ARMneon_VMULP:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X1001, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VMULFP:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD,
+ X1101, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VMULLP:
+ insn = XXXXXXXX(0xF, X0010, BITS4(1,D,sz1,sz2), regN, regD,
+ X1110, BITS4(N,0,M,0), regM);
+ break;
+ case ARMneon_VQDMULH:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X1011, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VQRDMULH:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X1011, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VQDMULL:
+ insn = XXXXXXXX(0xF, X0010, BITS4(1,D,sz1,sz2), regN, regD,
+ X1101, BITS4(N,0,M,0), regM);
+ break;
+ case ARMneon_VTBL:
+ insn = XXXXXXXX(0xF, X0011, BITS4(1,D,1,1), regN, regD,
+ X1000, BITS4(N,0,M,0), regM);
+ break;
+ case ARMneon_VPADD:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X1011, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VPADDFP:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD,
+ X1101, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VPMINU:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X1010, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VPMINS:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X1010, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VPMAXU:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X1010, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VPMAXS:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X1010, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VADDFP: /* VADD reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD,
+ X1101, BITS4(N,Q,M,0), regM);
+ break;
+            case ARMneon_VSUBFP: /* VSUB reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regN, regD,
+ X1101, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VABDFP: /* VABD reg, reg, reg */
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,1,0), regN, regD,
+ X1101, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VMINF:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regN, regD,
+ X1111, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VMAXF:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD,
+ X1111, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VPMINF:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,1,0), regN, regD,
+ X1111, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VPMAXF:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD,
+ X1111, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VRECPS:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD, X1111,
+ BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VCGTF:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,1,0), regN, regD, X1110,
+ BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VCGEF:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,0,0), regN, regD, X1110,
+ BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VCEQF:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,0,0), regN, regD, X1110,
+ BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VRSQRTS:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,1,0), regN, regD, X1111,
+ BITS4(N,Q,M,1), regM);
+ break;
+ default:
+ goto bad;
+ }
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_NShift: {
+ UInt Q = i->ARMin.NShift.Q ? 1 : 0;
+ UInt regD = (hregClass(i->ARMin.NShift.dst) == HRcVec128)
+ ? (qregNo(i->ARMin.NShift.dst) << 1)
+ : dregNo(i->ARMin.NShift.dst);
+ UInt regM = (hregClass(i->ARMin.NShift.argL) == HRcVec128)
+ ? (qregNo(i->ARMin.NShift.argL) << 1)
+ : dregNo(i->ARMin.NShift.argL);
+ UInt regN = (hregClass(i->ARMin.NShift.argR) == HRcVec128)
+ ? (qregNo(i->ARMin.NShift.argR) << 1)
+ : dregNo(i->ARMin.NShift.argR);
+ UInt sz1 = i->ARMin.NShift.size >> 1;
+ UInt sz2 = i->ARMin.NShift.size & 1;
+ UInt D = regD >> 4;
+ UInt N = regN >> 4;
+ UInt M = regM >> 4;
+ UInt insn;
+ regD &= 0xF;
+ regM &= 0xF;
+ regN &= 0xF;
+ switch (i->ARMin.NShift.op) {
+ case ARMneon_VSHL:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X0100, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VSAL:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X0100, BITS4(N,Q,M,0), regM);
+ break;
+ case ARMneon_VQSHL:
+ insn = XXXXXXXX(0xF, X0011, BITS4(0,D,sz1,sz2), regN, regD,
+ X0100, BITS4(N,Q,M,1), regM);
+ break;
+ case ARMneon_VQSAL:
+ insn = XXXXXXXX(0xF, X0010, BITS4(0,D,sz1,sz2), regN, regD,
+ X0100, BITS4(N,Q,M,1), regM);
+ break;
+ default:
+ goto bad;
+ }
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_NeonImm: {
+ UInt Q = (hregClass(i->ARMin.NeonImm.dst) == HRcVec128) ? 1 : 0;
+ UInt regD = Q ? (qregNo(i->ARMin.NeonImm.dst) << 1) :
+ dregNo(i->ARMin.NeonImm.dst);
+ UInt D = regD >> 4;
+ UInt imm = i->ARMin.NeonImm.imm->imm8;
+ UInt tp = i->ARMin.NeonImm.imm->type;
+ UInt j = imm >> 7;
+ UInt imm3 = (imm >> 4) & 0x7;
+ UInt imm4 = imm & 0xF;
+ UInt cmode, op;
+ UInt insn;
+ regD &= 0xF;
+ if (tp == 9)
+ op = 1;
+ else
+ op = 0;
+ switch (tp) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ cmode = tp << 1;
+ break;
+ case 9:
+ case 6:
+ cmode = 14;
+ break;
+ case 7:
+ cmode = 12;
+ break;
+ case 8:
+ cmode = 13;
+ break;
+ case 10:
+ cmode = 15;
+ break;
+ default:
+ vpanic("ARMin_NeonImm");
+ }
+ insn = XXXXXXXX(0xF, BITS4(0,0,1,j), BITS4(1,D,0,0), imm3, regD,
+ cmode, BITS4(0,Q,op,1), imm4);
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_NCMovQ: {
+ UInt cc = (UInt)i->ARMin.NCMovQ.cond;
+ UInt qM = qregNo(i->ARMin.NCMovQ.src) << 1;
+ UInt qD = qregNo(i->ARMin.NCMovQ.dst) << 1;
+ UInt vM = qM & 0xF;
+ UInt vD = qD & 0xF;
+ UInt M = (qM >> 4) & 1;
+ UInt D = (qD >> 4) & 1;
+ vassert(cc < 16 && cc != ARMcc_AL && cc != ARMcc_NV);
+ /* b!cc here+8: !cc A00 0000 */
+ UInt insn = XXXXXXXX(cc ^ 1, 0xA, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0);
+ *p++ = insn;
+ /* vmov qD, qM */
+ insn = XXXXXXXX(0xF, 0x2, BITS4(0,D,1,0),
+ vM, vD, BITS4(0,0,0,1), BITS4(M,1,M,1), vM);
+ *p++ = insn;
+ goto done;
+ }
+ case ARMin_Add32: {
+ UInt regD = iregNo(i->ARMin.Add32.rD);
+ UInt regN = iregNo(i->ARMin.Add32.rN);
+ UInt imm32 = i->ARMin.Add32.imm32;
+ vassert(regD != regN);
+ /* MOV regD, imm32 */
+ p = imm32_to_iregNo((UInt *)p, regD, imm32);
+ /* ADD regD, regN, regD */
+ UInt insn = XXXXXXXX(0xE, 0, X1000, regN, regD, 0, 0, regD);
+ *p++ = insn;
+ goto done;
+ }
+ /* ... */
+ default:
+ goto bad;
+ }
+
+ bad:
+ ppARMInstr(i);
+ vpanic("emit_ARMInstr");
+ /*NOTREACHED*/
+
+ done:
+ vassert(((UChar*)p) - &buf[0] <= 32);
+ return ((UChar*)p) - &buf[0];
+}
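+
+/* A usage sketch (illustrative, deliberately compiled out): encoding
+   "add r0, r1, r2" via the public ARMInstr constructors. */
+#if 0
+static void example_emit_ARMInstr ( void )
+{
+   UChar buf[32];
+   ARMInstr* add
+      = ARMInstr_Alu(ARMalu_ADD, hregARM_R0(), hregARM_R1(),
+                     ARMRI84_R(hregARM_R2()));
+   Int n = emit_ARMInstr(buf, 32, add, False/*!mode64*/, NULL);
+   vassert(n == 4);  /* one 32-bit insn: 0xE0810002 */
+}
+#endif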
+
+#undef BITS4
+#undef X0000
+#undef X0001
+#undef X0010
+#undef X0011
+#undef X0100
+#undef X0101
+#undef X0110
+#undef X0111
+#undef X1000
+#undef X1001
+#undef X1010
+#undef X1011
+#undef X1100
+#undef X1101
+#undef X1110
+#undef X1111
+#undef XXXXX___
+#undef XXXXXX__
+#undef XXX___XX
+#undef XXXXX__X
+#undef XXXXXXXX
+
+/*---------------------------------------------------------------*/
+/*--- end host_arm_defs.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_arm_defs.h b/VEX/priv/host_arm_defs.h
new file mode 100644
index 0000000..1901e80
--- /dev/null
+++ b/VEX/priv/host_arm_defs.h
@@ -0,0 +1,978 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_arm_defs.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+#ifndef __VEX_HOST_ARM_DEFS_H
+#define __VEX_HOST_ARM_DEFS_H
+
+extern UInt arm_hwcaps;
+
+
+/* --------- Registers. --------- */
+
+/* The usual HReg abstraction.
+ There are 16 general purpose regs.
+*/
+
+extern void ppHRegARM ( HReg );
+
+extern HReg hregARM_R0 ( void );
+extern HReg hregARM_R1 ( void );
+extern HReg hregARM_R2 ( void );
+extern HReg hregARM_R3 ( void );
+extern HReg hregARM_R4 ( void );
+extern HReg hregARM_R5 ( void );
+extern HReg hregARM_R6 ( void );
+extern HReg hregARM_R7 ( void );
+extern HReg hregARM_R8 ( void );
+extern HReg hregARM_R9 ( void );
+extern HReg hregARM_R10 ( void );
+extern HReg hregARM_R11 ( void );
+extern HReg hregARM_R12 ( void );
+extern HReg hregARM_R13 ( void );
+extern HReg hregARM_R14 ( void );
+extern HReg hregARM_R15 ( void );
+extern HReg hregARM_D8 ( void );
+extern HReg hregARM_D9 ( void );
+extern HReg hregARM_D10 ( void );
+extern HReg hregARM_D11 ( void );
+extern HReg hregARM_D12 ( void );
+extern HReg hregARM_S26 ( void );
+extern HReg hregARM_S27 ( void );
+extern HReg hregARM_S28 ( void );
+extern HReg hregARM_S29 ( void );
+extern HReg hregARM_S30 ( void );
+extern HReg hregARM_Q8 ( void );
+extern HReg hregARM_Q9 ( void );
+extern HReg hregARM_Q10 ( void );
+extern HReg hregARM_Q11 ( void );
+extern HReg hregARM_Q12 ( void );
+extern HReg hregARM_Q13 ( void );
+extern HReg hregARM_Q14 ( void );
+extern HReg hregARM_Q15 ( void );
+
+/* Number of registers used for arg passing in function calls */
+#define ARM_N_ARGREGS 4 /* r0, r1, r2, r3 */
+
+
+/* --------- Condition codes. --------- */
+
+typedef
+ enum {
+ ARMcc_EQ = 0, /* equal : Z=1 */
+ ARMcc_NE = 1, /* not equal : Z=0 */
+
+ ARMcc_HS = 2, /* >=u (higher or same) : C=1 */
+ ARMcc_LO = 3, /* <u (lower) : C=0 */
+
+ ARMcc_MI = 4, /* minus (negative) : N=1 */
+ ARMcc_PL = 5, /* plus (zero or +ve) : N=0 */
+
+ ARMcc_VS = 6, /* overflow : V=1 */
+ ARMcc_VC = 7, /* no overflow : V=0 */
+
+ ARMcc_HI = 8, /* >u (higher) : C=1 && Z=0 */
+ ARMcc_LS = 9, /* <=u (lower or same) : C=0 || Z=1 */
+
+ ARMcc_GE = 10, /* >=s (signed greater or equal) : N=V */
+ ARMcc_LT = 11, /* <s (signed less than) : N!=V */
+
+ ARMcc_GT = 12, /* >s (signed greater) : Z=0 && N=V */
+ ARMcc_LE = 13, /* <=s (signed less or equal) : Z=1 || N!=V */
+
+ ARMcc_AL = 14, /* always (unconditional) */
+ ARMcc_NV = 15 /* never (basically undefined meaning), deprecated */
+ }
+ ARMCondCode;
+
+extern HChar* showARMCondCode ( ARMCondCode );
+
+
+
+/* --------- Memory address expressions (amodes). --------- */
+
+/* --- Addressing Mode 1 --- */
+typedef
+ enum {
+ ARMam1_RI=1, /* reg +/- imm12 */
+      ARMam1_RRS /* reg1 + (reg2 << 0, 1, 2 or 3) */
+ }
+ ARMAMode1Tag;
+
+typedef
+ struct {
+ ARMAMode1Tag tag;
+ union {
+ struct {
+ HReg reg;
+ Int simm13; /* -4095 .. +4095 */
+ } RI;
+ struct {
+ HReg base;
+ HReg index;
+            UInt shift; /* 0, 1, 2 or 3 */
+ } RRS;
+ } ARMam1;
+ }
+ ARMAMode1;
+
+extern ARMAMode1* ARMAMode1_RI ( HReg reg, Int simm13 );
+extern ARMAMode1* ARMAMode1_RRS ( HReg base, HReg index, UInt shift );
+
+extern void ppARMAMode1 ( ARMAMode1* );
+
+
+/* --- Addressing Mode 2 --- */
+typedef
+ enum {
+ ARMam2_RI=3, /* reg +/- imm8 */
+ ARMam2_RR /* reg1 + reg2 */
+ }
+ ARMAMode2Tag;
+
+typedef
+ struct {
+ ARMAMode2Tag tag;
+ union {
+ struct {
+ HReg reg;
+ Int simm9; /* -255 .. 255 */
+ } RI;
+ struct {
+ HReg base;
+ HReg index;
+ } RR;
+ } ARMam2;
+ }
+ ARMAMode2;
+
+extern ARMAMode2* ARMAMode2_RI ( HReg reg, Int simm9 );
+extern ARMAMode2* ARMAMode2_RR ( HReg base, HReg index );
+
+extern void ppARMAMode2 ( ARMAMode2* );
+
+
+/* --- Addressing Mode suitable for VFP --- */
+/* The simm11 is encoded as 8 bits + 1 sign bit,
+   so its value can only be a multiple of 4. */
+typedef
+ struct {
+ HReg reg;
+ Int simm11; /* -1020, -1016 .. 1016, 1020 */
+ }
+ ARMAModeV;
+
+extern ARMAModeV* mkARMAModeV ( HReg reg, Int simm11 );
+
+extern void ppARMAModeV ( ARMAModeV* );
+
+/* --- Addressing Mode suitable for Neon --- */
+typedef
+ enum {
+ ARMamN_R=5,
+ ARMamN_RR
+ /* ... */
+ }
+ ARMAModeNTag;
+
+typedef
+ struct {
+ ARMAModeNTag tag;
+ union {
+ struct {
+ HReg rN;
+ HReg rM;
+ } RR;
+ struct {
+ HReg rN;
+ } R;
+ /* ... */
+ } ARMamN;
+ }
+ ARMAModeN;
+
+extern ARMAModeN* mkARMAModeN_RR ( HReg, HReg );
+extern ARMAModeN* mkARMAModeN_R ( HReg );
+extern void ppARMAModeN ( ARMAModeN* );
+
+/* --------- Reg or imm-8x4 operands --------- */
+/* a.k.a. (a very restricted form of) the Shifter Operand,
+   in ARM parlance. */
+
+typedef
+ enum {
+ ARMri84_I84=7, /* imm8 `ror` (2 * imm4) */
+ ARMri84_R /* reg */
+ }
+ ARMRI84Tag;
+
+typedef
+ struct {
+ ARMRI84Tag tag;
+ union {
+ struct {
+ UShort imm8;
+ UShort imm4;
+ } I84;
+ struct {
+ HReg reg;
+ } R;
+ } ARMri84;
+ }
+ ARMRI84;
+
+extern ARMRI84* ARMRI84_I84 ( UShort imm8, UShort imm4 );
+extern ARMRI84* ARMRI84_R ( HReg );
+
+extern void ppARMRI84 ( ARMRI84* );
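+
+/* For instance (illustrative), ARMRI84_I84(0xFF, 12) denotes the
+   immediate 0xFF `ror` 24 == 0x0000FF00, and ARMRI84_R(r) denotes a
+   plain register operand. */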
+
+
+/* --------- Reg or imm5 operands --------- */
+typedef
+ enum {
+ ARMri5_I5=9, /* imm5, 1 .. 31 only (no zero!) */
+ ARMri5_R /* reg */
+ }
+ ARMRI5Tag;
+
+typedef
+ struct {
+ ARMRI5Tag tag;
+ union {
+ struct {
+ UInt imm5;
+ } I5;
+ struct {
+ HReg reg;
+ } R;
+ } ARMri5;
+ }
+ ARMRI5;
+
+extern ARMRI5* ARMRI5_I5 ( UInt imm5 );
+extern ARMRI5* ARMRI5_R ( HReg );
+
+extern void ppARMRI5 ( ARMRI5* );
+
+/* -------- Neon Immediate operand -------- */
+
+/* imm8 = abcdefgh, B = NOT(b);
+
+type | value (64bit binary)
+-----+-------------------------------------------------------------------------
+ 0 | 00000000 00000000 00000000 abcdefgh 00000000 00000000 00000000 abcdefgh
+ 1 | 00000000 00000000 abcdefgh 00000000 00000000 00000000 abcdefgh 00000000
+ 2 | 00000000 abcdefgh 00000000 00000000 00000000 abcdefgh 00000000 00000000
+ 3 | abcdefgh 00000000 00000000 00000000 abcdefgh 00000000 00000000 00000000
+ 4 | 00000000 abcdefgh 00000000 abcdefgh 00000000 abcdefgh 00000000 abcdefgh
+ 5 | abcdefgh 00000000 abcdefgh 00000000 abcdefgh 00000000 abcdefgh 00000000
+ 6 | abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh abcdefgh
+ 7 | 00000000 00000000 abcdefgh 11111111 00000000 00000000 abcdefgh 11111111
+ 8 | 00000000 abcdefgh 11111111 11111111 00000000 abcdefgh 11111111 11111111
+ 9 | aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+ 10 | aBbbbbbc defgh000 00000000 00000000 aBbbbbbc defgh000 00000000 00000000
+-----+-------------------------------------------------------------------------
+
+Type 10 is:
+ (-1)^S * 2^exp * mantissa
+where S = a, exp = UInt(B:c:d) - 3, mantissa = (16 + UInt(e:f:g:h)) / 16
+*/
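+
+/* Worked example (illustrative): with type 10, imm8 = 0x70
+   (a=0 b=1 c=1 d=1 efgh=0000) gives S = 0, exp = UInt(0:1:1) - 3 = 0
+   and mantissa = 16/16, i.e. the value +1.0 replicated across the
+   vector. */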
+
+typedef
+ struct {
+ UInt type;
+ UInt imm8;
+ }
+ ARMNImm;
+
+extern ARMNImm* ARMNImm_TI ( UInt type, UInt imm8 );
+extern ULong ARMNImm_to_Imm64 ( ARMNImm* );
+extern ARMNImm* Imm64_to_ARMNImm ( ULong );
+
+extern void ppARMNImm ( ARMNImm* );
+
+/* ------ Neon Register or Scalar Operand ------ */
+
+typedef
+ enum {
+ ARMNRS_Reg=11,
+ ARMNRS_Scalar
+ }
+ ARMNRS_tag;
+
+typedef
+ struct {
+ ARMNRS_tag tag;
+ HReg reg;
+ UInt index;
+ }
+ ARMNRS;
+
+extern ARMNRS* mkARMNRS(ARMNRS_tag, HReg reg, UInt index);
+extern void ppARMNRS ( ARMNRS* );
+
+/* --------- Instructions. --------- */
+
+/* --------- */
+typedef
+ enum {
+ ARMalu_ADD=20, /* plain 32-bit add */
+ ARMalu_ADDS, /* 32-bit add, and set the flags */
+ ARMalu_ADC, /* 32-bit add with carry */
+ ARMalu_SUB, /* plain 32-bit subtract */
+ ARMalu_SUBS, /* 32-bit subtract, and set the flags */
+ ARMalu_SBC, /* 32-bit subtract with carry */
+ ARMalu_AND,
+ ARMalu_BIC,
+ ARMalu_OR,
+ ARMalu_XOR
+ }
+ ARMAluOp;
+
+extern HChar* showARMAluOp ( ARMAluOp op );
+
+
+typedef
+ enum {
+ ARMsh_SHL=40,
+ ARMsh_SHR,
+ ARMsh_SAR
+ }
+ ARMShiftOp;
+
+extern HChar* showARMShiftOp ( ARMShiftOp op );
+
+
+typedef
+ enum {
+ ARMun_NEG=50,
+ ARMun_NOT,
+ ARMun_CLZ
+ }
+ ARMUnaryOp;
+
+extern HChar* showARMUnaryOp ( ARMUnaryOp op );
+
+
+typedef
+ enum {
+ ARMmul_PLAIN=60,
+ ARMmul_ZX,
+ ARMmul_SX
+ }
+ ARMMulOp;
+
+extern HChar* showARMMulOp ( ARMMulOp op );
+
+
+typedef
+ enum {
+ ARMvfp_ADD=70,
+ ARMvfp_SUB,
+ ARMvfp_MUL,
+ ARMvfp_DIV
+ }
+ ARMVfpOp;
+
+extern HChar* showARMVfpOp ( ARMVfpOp op );
+
+
+typedef
+ enum {
+ ARMvfpu_COPY=80,
+ ARMvfpu_NEG,
+ ARMvfpu_ABS,
+ ARMvfpu_SQRT
+ }
+ ARMVfpUnaryOp;
+
+extern HChar* showARMVfpUnaryOp ( ARMVfpUnaryOp op );
+
+typedef
+ enum {
+ ARMneon_VAND=90,
+ ARMneon_VORR,
+ ARMneon_VXOR,
+ ARMneon_VADD,
+ ARMneon_VADDFP,
+ ARMneon_VRHADDS,
+ ARMneon_VRHADDU,
+ ARMneon_VPADDFP,
+ ARMneon_VABDFP,
+ ARMneon_VSUB,
+ ARMneon_VSUBFP,
+ ARMneon_VMAXU,
+ ARMneon_VMAXS,
+ ARMneon_VMAXF,
+ ARMneon_VMINU,
+ ARMneon_VMINS,
+ ARMneon_VMINF,
+ ARMneon_VQADDU,
+ ARMneon_VQADDS,
+ ARMneon_VQSUBU,
+ ARMneon_VQSUBS,
+ ARMneon_VCGTU,
+ ARMneon_VCGTS,
+ ARMneon_VCGEU,
+ ARMneon_VCGES,
+ ARMneon_VCGTF,
+ ARMneon_VCGEF,
+ ARMneon_VCEQ,
+ ARMneon_VCEQF,
+ ARMneon_VEXT,
+ ARMneon_VMUL,
+ ARMneon_VMULFP,
+ ARMneon_VMULLU,
+ ARMneon_VMULLS,
+ ARMneon_VMULP,
+ ARMneon_VMULLP,
+ ARMneon_VQDMULH,
+ ARMneon_VQRDMULH,
+ ARMneon_VPADD,
+ ARMneon_VPMINU,
+ ARMneon_VPMINS,
+ ARMneon_VPMINF,
+ ARMneon_VPMAXU,
+ ARMneon_VPMAXS,
+ ARMneon_VPMAXF,
+ ARMneon_VTBL,
+ ARMneon_VQDMULL,
+ ARMneon_VRECPS,
+ ARMneon_VRSQRTS,
+ /* ... */
+ }
+ ARMNeonBinOp;
+
+typedef
+ enum {
+ ARMneon_VSHL=150,
+ ARMneon_VSAL, /* Yah, not SAR but SAL */
+ ARMneon_VQSHL,
+ ARMneon_VQSAL
+ }
+ ARMNeonShiftOp;
+
+typedef
+ enum {
+ ARMneon_COPY=160,
+ ARMneon_COPYLU,
+ ARMneon_COPYLS,
+ ARMneon_COPYN,
+ ARMneon_COPYQNSS,
+ ARMneon_COPYQNUS,
+ ARMneon_COPYQNUU,
+ ARMneon_NOT,
+ ARMneon_EQZ,
+ ARMneon_DUP,
+ ARMneon_PADDLS,
+ ARMneon_PADDLU,
+ ARMneon_CNT,
+ ARMneon_CLZ,
+ ARMneon_CLS,
+ ARMneon_VCVTxFPxINT,
+ ARMneon_VQSHLNSS,
+ ARMneon_VQSHLNUU,
+ ARMneon_VQSHLNUS,
+ ARMneon_VCVTFtoU,
+ ARMneon_VCVTFtoS,
+ ARMneon_VCVTUtoF,
+ ARMneon_VCVTStoF,
+ ARMneon_VCVTFtoFixedU,
+ ARMneon_VCVTFtoFixedS,
+ ARMneon_VCVTFixedUtoF,
+ ARMneon_VCVTFixedStoF,
+ ARMneon_VCVTF16toF32,
+ ARMneon_VCVTF32toF16,
+ ARMneon_REV16,
+ ARMneon_REV32,
+ ARMneon_REV64,
+ ARMneon_ABS,
+ ARMneon_VNEGF,
+ ARMneon_VRECIP,
+ ARMneon_VRECIPF,
+ ARMneon_VABSFP,
+ ARMneon_VRSQRTEFP,
+ ARMneon_VRSQRTE
+ /* ... */
+ }
+ ARMNeonUnOp;
+
+typedef
+ enum {
+ ARMneon_SETELEM=200,
+ ARMneon_GETELEMU,
+ ARMneon_GETELEMS,
+ ARMneon_VDUP,
+ }
+ ARMNeonUnOpS;
+
+typedef
+ enum {
+ ARMneon_TRN=210,
+ ARMneon_ZIP,
+ ARMneon_UZP
+ /* ... */
+ }
+ ARMNeonDualOp;
+
+extern HChar* showARMNeonBinOp ( ARMNeonBinOp op );
+extern HChar* showARMNeonUnOp ( ARMNeonUnOp op );
+extern HChar* showARMNeonUnOpS ( ARMNeonUnOpS op );
+extern HChar* showARMNeonShiftOp ( ARMNeonShiftOp op );
+extern HChar* showARMNeonDualOp ( ARMNeonDualOp op );
+extern HChar* showARMNeonBinOpDataType ( ARMNeonBinOp op );
+extern HChar* showARMNeonUnOpDataType ( ARMNeonUnOp op );
+extern HChar* showARMNeonUnOpSDataType ( ARMNeonUnOpS op );
+extern HChar* showARMNeonShiftOpDataType ( ARMNeonShiftOp op );
+extern HChar* showARMNeonDualOpDataType ( ARMNeonDualOp op );
+
+typedef
+ enum {
+ /* baseline */
+ ARMin_Alu=220,
+ ARMin_Shift,
+ ARMin_Unary,
+ ARMin_CmpOrTst,
+ ARMin_Mov,
+ ARMin_Imm32,
+ ARMin_LdSt32,
+ ARMin_LdSt16,
+ ARMin_LdSt8U,
+ ARMin_Ld8S,
+ ARMin_Goto,
+ ARMin_CMov,
+ ARMin_Call,
+ ARMin_Mul,
+ ARMin_LdrEX,
+ ARMin_StrEX,
+ /* vfp */
+ ARMin_VLdStD,
+ ARMin_VLdStS,
+ ARMin_VAluD,
+ ARMin_VAluS,
+ ARMin_VUnaryD,
+ ARMin_VUnaryS,
+ ARMin_VCmpD,
+ ARMin_VCMovD,
+ ARMin_VCMovS,
+ ARMin_VCvtSD,
+ ARMin_VXferD,
+ ARMin_VXferS,
+ ARMin_VCvtID,
+ ARMin_FPSCR,
+ ARMin_MFence,
+ /* Neon */
+ ARMin_NLdStQ,
+ ARMin_NLdStD,
+ ARMin_NUnary,
+ ARMin_NUnaryS,
+ ARMin_NDual,
+ ARMin_NBinary,
+ ARMin_NBinaryS,
+ ARMin_NShift,
+ ARMin_NeonImm,
+ ARMin_NCMovQ,
+      /* This is not a NEON instruction.  In fact there is no
+         corresponding instruction in the ARM instruction set at all.
+         We need it to generate spill/reload of 128-bit registers,
+         since the current register allocator requires a spill/reload
+         to consist of no more than two instructions.  We split this
+         instruction into 2 or 3 real ARM instructions during the
+         emitting phase.
+
+         NOTE: source and destination registers must be different! */
+ ARMin_Add32
+ }
+ ARMInstrTag;
+
+/* Destinations are on the LEFT (first operand) */
+
+typedef
+ struct {
+ ARMInstrTag tag;
+ union {
+ /* ADD/SUB/AND/OR/XOR, vanilla ALU op */
+ struct {
+ ARMAluOp op;
+ HReg dst;
+ HReg argL;
+ ARMRI84* argR;
+ } Alu;
+ /* SHL/SHR/SAR, 2nd arg is reg or imm */
+ struct {
+ ARMShiftOp op;
+ HReg dst;
+ HReg argL;
+ ARMRI5* argR;
+ } Shift;
+ /* NOT/NEG/CLZ */
+ struct {
+ ARMUnaryOp op;
+ HReg dst;
+ HReg src;
+ } Unary;
+ /* CMP/TST; subtract/and, discard result, set NZCV */
+ struct {
+ Bool isCmp;
+ HReg argL;
+ ARMRI84* argR;
+ } CmpOrTst;
+ /* MOV dst, src -- reg-reg (or reg-imm8x4) move */
+ struct {
+ HReg dst;
+ ARMRI84* src;
+ } Mov;
+ /* Pseudo-insn; make a 32-bit immediate */
+ struct {
+ HReg dst;
+ UInt imm32;
+ } Imm32;
+ /* 32-bit load or store */
+ struct {
+ Bool isLoad;
+ HReg rD;
+ ARMAMode1* amode;
+ } LdSt32;
+ /* 16-bit load or store */
+ struct {
+ Bool isLoad;
+ Bool signedLoad;
+ HReg rD;
+ ARMAMode2* amode;
+ } LdSt16;
+ /* 8-bit (unsigned) load or store */
+ struct {
+ Bool isLoad;
+ HReg rD;
+ ARMAMode1* amode;
+ } LdSt8U;
+ /* 8-bit signed load */
+ struct {
+ HReg rD;
+ ARMAMode2* amode;
+ } Ld8S;
+ /* Pseudo-insn. Go to guest address gnext, on given
+ condition, which could be ARMcc_AL. */
+ struct {
+ IRJumpKind jk;
+ ARMCondCode cond;
+ HReg gnext;
+ } Goto;
+ /* Mov src to dst on the given condition, which may not
+ be ARMcc_AL. */
+ struct {
+ ARMCondCode cond;
+ HReg dst;
+ ARMRI84* src;
+ } CMov;
+ /* Pseudo-insn. Call target (an absolute address), on given
+ condition (which could be ARMcc_AL). */
+ struct {
+ ARMCondCode cond;
+ HWord target;
+ Int nArgRegs; /* # regs carrying args: 0 .. 4 */
+ } Call;
+ /* (PLAIN) 32 * 32 -> 32: r0 = r2 * r3
+ (ZX) 32 *u 32 -> 64: r1:r0 = r2 *u r3
+ (SX) 32 *s 32 -> 64: r1:r0 = r2 *s r3
+ Why hardwired registers? Because the ARM ARM specifies
+ (eg for straight MUL) the result (Rd) and the left arg (Rm)
+ may not be the same register. That's not a constraint we
+ can enforce in the register allocator (without mucho extra
+ complexity). Hence hardwire it. At least using caller-saves
+ registers, which are less likely to be in use. */
+ struct {
+ ARMMulOp op;
+ } Mul;
+ /* LDREX{,H,B} r0, [r1]
+ Again, hardwired registers since this is not performance
+ critical, and there are possibly constraints on the
+ registers that we can't express in the register allocator.*/
+ struct {
+         Int szB; /* currently only 4 and 1 are allowed */
+ } LdrEX;
+ /* STREX{,H,B} r0, r1, [r2]
+ r0 = SC( [r2] = r1 )
+ Ditto comment re fixed registers. */
+ struct {
+         Int szB; /* currently only 4 and 1 are allowed */
+ } StrEX;
+ /* VFP INSTRUCTIONS */
+ /* 64-bit Fp load/store */
+ struct {
+ Bool isLoad;
+ HReg dD;
+ ARMAModeV* amode;
+ } VLdStD;
+ /* 32-bit Fp load/store */
+ struct {
+ Bool isLoad;
+ HReg fD;
+ ARMAModeV* amode;
+ } VLdStS;
+ /* 64-bit FP binary arithmetic */
+ struct {
+ ARMVfpOp op;
+ HReg dst;
+ HReg argL;
+ HReg argR;
+ } VAluD;
+ /* 32-bit FP binary arithmetic */
+ struct {
+ ARMVfpOp op;
+ HReg dst;
+ HReg argL;
+ HReg argR;
+ } VAluS;
+ /* 64-bit FP unary, also reg-reg move */
+ struct {
+ ARMVfpUnaryOp op;
+ HReg dst;
+ HReg src;
+ } VUnaryD;
+ /* 32-bit FP unary, also reg-reg move */
+ struct {
+ ARMVfpUnaryOp op;
+ HReg dst;
+ HReg src;
+ } VUnaryS;
+ /* 64-bit FP compare and move results to CPSR (FCMPD;FMSTAT) */
+ struct {
+ HReg argL;
+ HReg argR;
+ } VCmpD;
+ /* 64-bit FP mov src to dst on the given condition, which may
+ not be ARMcc_AL. */
+ struct {
+ ARMCondCode cond;
+ HReg dst;
+ HReg src;
+ } VCMovD;
+ /* 32-bit FP mov src to dst on the given condition, which may
+ not be ARMcc_AL. */
+ struct {
+ ARMCondCode cond;
+ HReg dst;
+ HReg src;
+ } VCMovS;
+ /* Convert between 32-bit and 64-bit FP values (both ways).
+ (FCVTSD, FCVTDS) */
+ struct {
+ Bool sToD; /* True: F32->F64. False: F64->F32 */
+ HReg dst;
+ HReg src;
+ } VCvtSD;
+ /* Transfer a VFP D reg to/from two integer registers (VMOV) */
+ struct {
+ Bool toD;
+ HReg dD;
+ HReg rHi;
+ HReg rLo;
+ } VXferD;
+ /* Transfer a VFP S reg to/from an integer register (VMOV) */
+ struct {
+ Bool toS;
+ HReg fD;
+ HReg rLo;
+ } VXferS;
+ /* Convert between 32-bit ints and 64-bit FP values (both ways
+ and both signednesses). (FSITOD, FUITOD, FTOSID, FTOUID) */
+ struct {
+ Bool iToD; /* True: I32->F64. False: F64->I32 */
+ Bool syned; /* True: I32 is signed. False: I32 is unsigned */
+ HReg dst;
+ HReg src;
+ } VCvtID;
+ /* Move a 32-bit value to/from the FPSCR (FMXR, FMRX) */
+ struct {
+ Bool toFPSCR;
+ HReg iReg;
+ } FPSCR;
+ /* Mem fence. An insn which fences all loads and stores as
+ much as possible before continuing. On ARM we emit the
+ sequence
+ mcr 15,0,r0,c7,c10,4 (DSB)
+ mcr 15,0,r0,c7,c10,5 (DMB)
+ mcr 15,0,r0,c7,c5,4 (ISB)
+ which is probably total overkill, but better safe than
+ sorry.
+ */
+ struct {
+ } MFence;
+ /* Neon data processing instruction: 3 registers of the same
+ length */
+ struct {
+ ARMNeonBinOp op;
+ HReg dst;
+ HReg argL;
+ HReg argR;
+ UInt size;
+ Bool Q;
+ } NBinary;
+ struct {
+ ARMNeonBinOp op;
+ ARMNRS* dst;
+ ARMNRS* argL;
+ ARMNRS* argR;
+ UInt size;
+ Bool Q;
+ } NBinaryS;
+ struct {
+ ARMNeonShiftOp op;
+ HReg dst;
+ HReg argL;
+ HReg argR;
+ UInt size;
+ Bool Q;
+ } NShift;
+ struct {
+ Bool isLoad;
+ HReg dQ;
+ ARMAModeN *amode;
+ } NLdStQ;
+ struct {
+ Bool isLoad;
+ HReg dD;
+ ARMAModeN *amode;
+ } NLdStD;
+ struct {
+ ARMNeonUnOpS op;
+ ARMNRS* dst;
+ ARMNRS* src;
+ UInt size;
+ Bool Q;
+ } NUnaryS;
+ struct {
+ ARMNeonUnOp op;
+ HReg dst;
+ HReg src;
+ UInt size;
+ Bool Q;
+ } NUnary;
+ /* Takes two arguments and modifies them both. */
+ struct {
+ ARMNeonDualOp op;
+ HReg arg1;
+ HReg arg2;
+ UInt size;
+ Bool Q;
+ } NDual;
+ struct {
+ HReg dst;
+ ARMNImm* imm;
+ } NeonImm;
+ /* 128-bit Neon move src to dst on the given condition, which
+ may not be ARMcc_AL. */
+ struct {
+ ARMCondCode cond;
+ HReg dst;
+ HReg src;
+ } NCMovQ;
+ struct {
+ /* Note: rD != rN */
+ HReg rD;
+ HReg rN;
+ UInt imm32;
+ } Add32;
+ } ARMin;
+ }
+ ARMInstr;
+
+
+extern ARMInstr* ARMInstr_Alu ( ARMAluOp, HReg, HReg, ARMRI84* );
+extern ARMInstr* ARMInstr_Shift ( ARMShiftOp, HReg, HReg, ARMRI5* );
+extern ARMInstr* ARMInstr_Unary ( ARMUnaryOp, HReg, HReg );
+extern ARMInstr* ARMInstr_CmpOrTst ( Bool isCmp, HReg, ARMRI84* );
+extern ARMInstr* ARMInstr_Mov ( HReg, ARMRI84* );
+extern ARMInstr* ARMInstr_Imm32 ( HReg, UInt );
+extern ARMInstr* ARMInstr_LdSt32 ( Bool isLoad, HReg, ARMAMode1* );
+extern ARMInstr* ARMInstr_LdSt16 ( Bool isLoad, Bool signedLoad,
+ HReg, ARMAMode2* );
+extern ARMInstr* ARMInstr_LdSt8U ( Bool isLoad, HReg, ARMAMode1* );
+extern ARMInstr* ARMInstr_Ld8S ( HReg, ARMAMode2* );
+extern ARMInstr* ARMInstr_Goto ( IRJumpKind, ARMCondCode, HReg gnext );
+extern ARMInstr* ARMInstr_CMov ( ARMCondCode, HReg dst, ARMRI84* src );
+extern ARMInstr* ARMInstr_Call ( ARMCondCode, HWord, Int nArgRegs );
+extern ARMInstr* ARMInstr_Mul ( ARMMulOp op );
+extern ARMInstr* ARMInstr_LdrEX ( Int szB );
+extern ARMInstr* ARMInstr_StrEX ( Int szB );
+extern ARMInstr* ARMInstr_VLdStD ( Bool isLoad, HReg, ARMAModeV* );
+extern ARMInstr* ARMInstr_VLdStS ( Bool isLoad, HReg, ARMAModeV* );
+extern ARMInstr* ARMInstr_VAluD ( ARMVfpOp op, HReg, HReg, HReg );
+extern ARMInstr* ARMInstr_VAluS ( ARMVfpOp op, HReg, HReg, HReg );
+extern ARMInstr* ARMInstr_VUnaryD ( ARMVfpUnaryOp, HReg dst, HReg src );
+extern ARMInstr* ARMInstr_VUnaryS ( ARMVfpUnaryOp, HReg dst, HReg src );
+extern ARMInstr* ARMInstr_VCmpD ( HReg argL, HReg argR );
+extern ARMInstr* ARMInstr_VCMovD ( ARMCondCode, HReg dst, HReg src );
+extern ARMInstr* ARMInstr_VCMovS ( ARMCondCode, HReg dst, HReg src );
+extern ARMInstr* ARMInstr_VCvtSD ( Bool sToD, HReg dst, HReg src );
+extern ARMInstr* ARMInstr_VXferD ( Bool toD, HReg dD, HReg rHi, HReg rLo );
+extern ARMInstr* ARMInstr_VXferS ( Bool toS, HReg fD, HReg rLo );
+extern ARMInstr* ARMInstr_VCvtID ( Bool iToD, Bool syned,
+ HReg dst, HReg src );
+extern ARMInstr* ARMInstr_FPSCR ( Bool toFPSCR, HReg iReg );
+extern ARMInstr* ARMInstr_MFence ( void );
+extern ARMInstr* ARMInstr_NLdStQ ( Bool isLoad, HReg, ARMAModeN* );
+extern ARMInstr* ARMInstr_NLdStD ( Bool isLoad, HReg, ARMAModeN* );
+extern ARMInstr* ARMInstr_NUnary ( ARMNeonUnOp, HReg, HReg, UInt, Bool );
+extern ARMInstr* ARMInstr_NUnaryS ( ARMNeonUnOp, ARMNRS*, ARMNRS*,
+ UInt, Bool );
+extern ARMInstr* ARMInstr_NDual ( ARMNeonDualOp, HReg, HReg, UInt, Bool );
+extern ARMInstr* ARMInstr_NBinary ( ARMNeonBinOp, HReg, HReg, HReg,
+ UInt, Bool );
+extern ARMInstr* ARMInstr_NShift ( ARMNeonShiftOp, HReg, HReg, HReg,
+ UInt, Bool );
+extern ARMInstr* ARMInstr_NeonImm ( HReg, ARMNImm* );
+extern ARMInstr* ARMInstr_NCMovQ ( ARMCondCode, HReg, HReg );
+extern ARMInstr* ARMInstr_Add32 ( HReg rD, HReg rN, UInt imm32 );
+
+extern void ppARMInstr ( ARMInstr* );
+
+
+/* Some functions that insulate the register allocator from details
+ of the underlying instruction set. */
+extern void getRegUsage_ARMInstr ( HRegUsage*, ARMInstr*, Bool );
+extern void mapRegs_ARMInstr ( HRegRemap*, ARMInstr*, Bool );
+extern Bool isMove_ARMInstr ( ARMInstr*, HReg*, HReg* );
+extern Int emit_ARMInstr ( UChar* buf, Int nbuf, ARMInstr*,
+ Bool, void* dispatch );
+
+extern void genSpill_ARM ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offset, Bool );
+extern void genReload_ARM ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offset, Bool );
+
+extern void getAllocableRegs_ARM ( Int*, HReg** );
+extern HInstrArray* iselSB_ARM ( IRSB*, VexArch,
+ VexArchInfo*, VexAbiInfo* );
+
+#endif /* ndef __VEX_HOST_ARM_DEFS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_arm_defs.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_arm_isel.c b/VEX/priv/host_arm_isel.c
new file mode 100644
index 0000000..4bba9a3
--- /dev/null
+++ b/VEX/priv/host_arm_isel.c
@@ -0,0 +1,6023 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_arm_isel.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ NEON support is
+ Copyright (C) 2010-2010 Samsung Electronics
+ contributed by Dmitry Zhurikhin <zhur@ispras.ru>
+ and Kirill Batuzov <batuzovk@ispras.ru>
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+#include "ir_match.h"
+
+#include "main_util.h"
+#include "main_globals.h"
+#include "host_generic_regs.h"
+#include "host_generic_simd64.h" // for 32-bit SIMD helpers
+#include "host_arm_defs.h"
+
+
+/*---------------------------------------------------------*/
+/*--- ARMvfp control word stuff ---*/
+/*---------------------------------------------------------*/
+
+/* Vex-generated code expects to run with the FPU set as follows: all
+ exceptions masked, round-to-nearest, non-vector mode, with the NZCV
+ flags cleared, and FZ (flush to zero) disabled. Curiously enough,
+ this corresponds to a FPSCR value of zero.
+
+ fpscr should therefore be zero on entry to Vex-generated code, and
+ should be unchanged at exit. (Or at least the bottom 28 bits
+ should be zero).
+*/
+
+#define DEFAULT_FPSCR 0
+
+
+/*---------------------------------------------------------*/
+/*--- ISelEnv ---*/
+/*---------------------------------------------------------*/
+
+/* This carries around:
+
+ - A mapping from IRTemp to IRType, giving the type of any IRTemp we
+ might encounter. This is computed before insn selection starts,
+ and does not change.
+
+ - A mapping from IRTemp to HReg. This tells the insn selector
+ which virtual register(s) are associated with each IRTemp
+ temporary. This is computed before insn selection starts, and
+ does not change. We expect this mapping to map precisely the
+ same set of IRTemps as the type mapping does.
+
+ - vregmap holds the primary register for the IRTemp.
+ - vregmapHI is only used for 64-bit integer-typed
+ IRTemps. It holds the identity of a second
+ 32-bit virtual HReg, which holds the high half
+ of the value.
+
+ - The name of the vreg in which we stash a copy of the link reg, so
+ helper functions don't kill it.
+
+ - The code array, that is, the insns selected so far.
+
+ - A counter, for generating new virtual registers.
+
+ - The host hardware capabilities word. This is set at the start
+ and does not change.
+
+   Note, this is all (well, mostly) host-independent. */
+
+typedef
+ struct {
+ IRTypeEnv* type_env;
+
+ HReg* vregmap;
+ HReg* vregmapHI;
+ Int n_vregmap;
+
+ HReg savedLR;
+
+ HInstrArray* code;
+
+ Int vreg_ctr;
+
+ UInt hwcaps;
+ }
+ ISelEnv;
+
+static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
+{
+ vassert(tmp >= 0);
+ vassert(tmp < env->n_vregmap);
+ return env->vregmap[tmp];
+}
+
+static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
+{
+ vassert(tmp >= 0);
+ vassert(tmp < env->n_vregmap);
+ vassert(env->vregmapHI[tmp] != INVALID_HREG);
+ *vrLO = env->vregmap[tmp];
+ *vrHI = env->vregmapHI[tmp];
+}
+
+static void addInstr ( ISelEnv* env, ARMInstr* instr )
+{
+ addHInstr(env->code, instr);
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ ppARMInstr(instr);
+ vex_printf("\n");
+ }
+#if 0
+ if (instr->tag == ARMin_NUnary || instr->tag == ARMin_NBinary
+ || instr->tag == ARMin_NUnaryS || instr->tag == ARMin_NBinaryS
+ || instr->tag == ARMin_NDual || instr->tag == ARMin_NShift) {
+ ppARMInstr(instr);
+ vex_printf("\n");
+ }
+#endif
+}
+
+static HReg newVRegI ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcInt32, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+static HReg newVRegD ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+static HReg newVRegF ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcFlt32, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+static HReg newVRegV ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+/* These are duplicated in guest_arm_toIR.c */
+static IRExpr* unop ( IROp op, IRExpr* a )
+{
+ return IRExpr_Unop(op, a);
+}
+
+static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
+{
+ return IRExpr_Binop(op, a1, a2);
+}
+
+static IRExpr* bind ( Int binder )
+{
+ return IRExpr_Binder(binder);
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Forward declarations ---*/
+/*---------------------------------------------------------*/
+
+/* These are organised as iselXXX and iselXXX_wrk pairs. The
+ iselXXX_wrk do the real work, but are not to be called directly.
+ For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
+   checks that all returned registers are virtual.
+*/
+static ARMAMode1* iselIntExpr_AMode1_wrk ( ISelEnv* env, IRExpr* e );
+static ARMAMode1* iselIntExpr_AMode1 ( ISelEnv* env, IRExpr* e );
+
+static ARMAMode2* iselIntExpr_AMode2_wrk ( ISelEnv* env, IRExpr* e );
+static ARMAMode2* iselIntExpr_AMode2 ( ISelEnv* env, IRExpr* e );
+
+static ARMAModeV* iselIntExpr_AModeV_wrk ( ISelEnv* env, IRExpr* e );
+static ARMAModeV* iselIntExpr_AModeV ( ISelEnv* env, IRExpr* e );
+
+static ARMAModeN* iselIntExpr_AModeN_wrk ( ISelEnv* env, IRExpr* e );
+static ARMAModeN* iselIntExpr_AModeN ( ISelEnv* env, IRExpr* e );
+
+static ARMRI84* iselIntExpr_RI84_wrk
+ ( /*OUT*/Bool* didInv, Bool mayInv, ISelEnv* env, IRExpr* e );
+static ARMRI84* iselIntExpr_RI84
+ ( /*OUT*/Bool* didInv, Bool mayInv, ISelEnv* env, IRExpr* e );
+
+static ARMRI5* iselIntExpr_RI5_wrk ( ISelEnv* env, IRExpr* e );
+static ARMRI5* iselIntExpr_RI5 ( ISelEnv* env, IRExpr* e );
+
+static ARMCondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
+static ARMCondCode iselCondCode ( ISelEnv* env, IRExpr* e );
+
+static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e );
+
+static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+static void iselInt64Expr ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+
+static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselDblExpr ( ISelEnv* env, IRExpr* e );
+
+static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselFltExpr ( ISelEnv* env, IRExpr* e );
+
+static HReg iselNeon64Expr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselNeon64Expr ( ISelEnv* env, IRExpr* e );
+
+static HReg iselNeonExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselNeonExpr ( ISelEnv* env, IRExpr* e );
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Misc helpers ---*/
+/*---------------------------------------------------------*/
+
+static UInt ROR32 ( UInt x, UInt sh ) {
+ vassert(sh >= 0 && sh < 32);
+ if (sh == 0)
+ return x;
+ else
+ return (x << (32-sh)) | (x >> sh);
+}
+
+/* Figure out if 'u' fits in the special shifter-operand 8x4 immediate
+ form, and if so return the components. */
+static Bool fitsIn8x4 ( /*OUT*/UInt* u8, /*OUT*/UInt* u4, UInt u )
+{
+ UInt i;
+ for (i = 0; i < 16; i++) {
+ if (0 == (u & 0xFFFFFF00)) {
+ *u8 = u;
+ *u4 = i;
+ return True;
+ }
+ u = ROR32(u, 30);
+ }
+ vassert(i == 16);
+ return False;
+}
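+
+/* Worked example (illustrative): fitsIn8x4(&u8, &u4, 0x0000AB00)
+   succeeds with u8 = 0xAB and u4 = 12, since 0xAB ROR (2*12)
+   == 0x0000AB00.  Each loop iteration rotates u left by 2
+   (ROR32(u,30) == rotate-left-by-2), and after 12 steps 0x0000AB00
+   has become 0xAB, which fits in 8 bits.  By contrast 0x101 always
+   fails: its set bits span 9 positions, so no rotation can bring
+   them within an 8-bit window. */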
+
+/* Make an integer reg-reg move. */
+static ARMInstr* mk_iMOVds_RR ( HReg dst, HReg src )
+{
+ vassert(hregClass(src) == HRcInt32);
+ vassert(hregClass(dst) == HRcInt32);
+ return ARMInstr_Mov(dst, ARMRI84_R(src));
+}
+
+/* Set the VFP unit's rounding mode to default (round to nearest). */
+static void set_VFP_rounding_default ( ISelEnv* env )
+{
+ /* mov rTmp, #DEFAULT_FPSCR
+ fmxr fpscr, rTmp
+ */
+ HReg rTmp = newVRegI(env);
+ addInstr(env, ARMInstr_Imm32(rTmp, DEFAULT_FPSCR));
+ addInstr(env, ARMInstr_FPSCR(True/*toFPSCR*/, rTmp));
+}
+
+/* Mess with the VFP unit's rounding mode: 'mode' is an I32-typed
+ expression denoting a value in the range 0 .. 3, indicating a round
+ mode encoded as per type IRRoundingMode. Set FPSCR to have the
+ same rounding.
+*/
+static
+void set_VFP_rounding_mode ( ISelEnv* env, IRExpr* mode )
+{
+ /* This isn't simple, because 'mode' carries an IR rounding
+ encoding, and we need to translate that to an ARMvfp one:
+ The IR encoding:
+ 00 to nearest (the default)
+ 10 to +infinity
+ 01 to -infinity
+ 11 to zero
+ The ARMvfp encoding:
+ 00 to nearest
+ 01 to +infinity
+ 10 to -infinity
+ 11 to zero
+ Easy enough to do; just swap the two bits.
+ */
+ HReg irrm = iselIntExpr_R(env, mode);
+ HReg tL = newVRegI(env);
+ HReg tR = newVRegI(env);
+ HReg t3 = newVRegI(env);
+ /* tL = irrm << 1;
+ tR = irrm >> 1; if we're lucky, these will issue together
+ tL &= 2;
+ tR &= 1; ditto
+ t3 = tL | tR;
+ t3 <<= 22;
+ fmxr fpscr, t3
+ */
+ addInstr(env, ARMInstr_Shift(ARMsh_SHL, tL, irrm, ARMRI5_I5(1)));
+ addInstr(env, ARMInstr_Shift(ARMsh_SHR, tR, irrm, ARMRI5_I5(1)));
+ addInstr(env, ARMInstr_Alu(ARMalu_AND, tL, tL, ARMRI84_I84(2,0)));
+ addInstr(env, ARMInstr_Alu(ARMalu_AND, tR, tR, ARMRI84_I84(1,0)));
+ addInstr(env, ARMInstr_Alu(ARMalu_OR, t3, tL, ARMRI84_R(tR)));
+ addInstr(env, ARMInstr_Shift(ARMsh_SHL, t3, t3, ARMRI5_I5(22)));
+ addInstr(env, ARMInstr_FPSCR(True/*toFPSCR*/, t3));
+}
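+
+/* Worked trace (illustrative): for IR mode 2 (10b, round to
+   +infinity), tL = (2 << 1) & 2 = 0 and tR = (2 >> 1) & 1 = 1,
+   so t3 = 0 | 1 = 1 (01b), the ARMvfp encoding of
+   round-to-+infinity.  The final shift by 22 places these two
+   bits in FPSCR[23:22], the VFP RMode field. */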
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Function call helpers ---*/
+/*---------------------------------------------------------*/
+
+/* Used only in doHelperCall. See big comment in doHelperCall re
+ handling of register-parameter args. This function figures out
+ whether evaluation of an expression might require use of a fixed
+ register. If in doubt return True (safe but suboptimal).
+*/
+static
+Bool mightRequireFixedRegs ( IRExpr* e )
+{
+ switch (e->tag) {
+ case Iex_RdTmp: case Iex_Const: case Iex_Get:
+ return False;
+ default:
+ return True;
+ }
+}
+
+
+/* Do a complete function call. guard is a Ity_Bit expression
+ indicating whether or not the call happens. If guard==NULL, the
+ call is unconditional. Returns True iff it managed to handle this
+ combination of arg/return types, else returns False. */
+
+static
+Bool doHelperCall ( ISelEnv* env,
+ Bool passBBP,
+ IRExpr* guard, IRCallee* cee, IRExpr** args )
+{
+ ARMCondCode cc;
+ HReg argregs[ARM_N_ARGREGS];
+ HReg tmpregs[ARM_N_ARGREGS];
+ Bool go_fast;
+ Int n_args, i, nextArgReg;
+ ULong target;
+
+ vassert(ARM_N_ARGREGS == 4);
+
+ /* Marshal args for a call and do the call.
+
+ If passBBP is True, r8 (the baseblock pointer) is to be passed
+ as the first arg.
+
+ This function only deals with a tiny set of possibilities, which
+ cover all helpers in practice. The restrictions are that only
+      arguments in registers are supported, hence only ARM_N_ARGREGS
+ x 32 integer bits in total can be passed. In fact the only
+ supported arg types are I32 and I64.
+
+ Generating code which is both efficient and correct when
+ parameters are to be passed in registers is difficult, for the
+ reasons elaborated in detail in comments attached to
+ doHelperCall() in priv/host-x86/isel.c. Here, we use a variant
+ of the method described in those comments.
+
+ The problem is split into two cases: the fast scheme and the
+ slow scheme. In the fast scheme, arguments are computed
+ directly into the target (real) registers. This is only safe
+ when we can be sure that computation of each argument will not
+ trash any real registers set by computation of any other
+ argument.
+
+ In the slow scheme, all args are first computed into vregs, and
+ once they are all done, they are moved to the relevant real
+ regs. This always gives correct code, but it also gives a bunch
+ of vreg-to-rreg moves which are usually redundant but are hard
+ for the register allocator to get rid of.
+
+ To decide which scheme to use, all argument expressions are
+ first examined. If they are all so simple that it is clear they
+ will be evaluated without use of any fixed registers, use the
+ fast scheme, else use the slow scheme. Note also that only
+ unconditional calls may use the fast scheme, since having to
+ compute a condition expression could itself trash real
+ registers.
+
+ Note this requires being able to examine an expression and
+ determine whether or not evaluation of it might use a fixed
+ register. That requires knowledge of how the rest of this insn
+ selector works. Currently just the following 3 are regarded as
+ safe -- hopefully they cover the majority of arguments in
+      practice: Iex_RdTmp, Iex_Const and Iex_Get.
+ */
+
+ /* Note that the cee->regparms field is meaningless on ARM hosts
+ (since there is only one calling convention) and so we always
+ ignore it. */
+
+ n_args = 0;
+ for (i = 0; args[i]; i++)
+ n_args++;
+
+ argregs[0] = hregARM_R0();
+ argregs[1] = hregARM_R1();
+ argregs[2] = hregARM_R2();
+ argregs[3] = hregARM_R3();
+
+ tmpregs[0] = tmpregs[1] = tmpregs[2] =
+ tmpregs[3] = INVALID_HREG;
+
+ /* First decide which scheme (slow or fast) is to be used. First
+ assume the fast scheme, and select slow if any contraindications
+ (wow) appear. */
+
+ go_fast = True;
+
+ if (guard) {
+ if (guard->tag == Iex_Const
+ && guard->Iex.Const.con->tag == Ico_U1
+ && guard->Iex.Const.con->Ico.U1 == True) {
+ /* unconditional */
+ } else {
+ /* Not manifestly unconditional -- be conservative. */
+ go_fast = False;
+ }
+ }
+
+ if (go_fast) {
+ for (i = 0; i < n_args; i++) {
+ if (mightRequireFixedRegs(args[i])) {
+ go_fast = False;
+ break;
+ }
+ }
+ }
+ /* At this point the scheme to use has been established. Generate
+ code to get the arg values into the argument rregs. If we run
+ out of arg regs, give up. */
+
+ if (go_fast) {
+
+ /* FAST SCHEME */
+ nextArgReg = 0;
+ if (passBBP) {
+ addInstr(env, mk_iMOVds_RR( argregs[nextArgReg],
+ hregARM_R8() ));
+ nextArgReg++;
+ }
+
+ for (i = 0; i < n_args; i++) {
+ IRType aTy = typeOfIRExpr(env->type_env, args[i]);
+ if (nextArgReg >= ARM_N_ARGREGS)
+ return False; /* out of argregs */
+ if (aTy == Ity_I32) {
+ addInstr(env, mk_iMOVds_RR( argregs[nextArgReg],
+ iselIntExpr_R(env, args[i]) ));
+ nextArgReg++;
+ }
+ else if (aTy == Ity_I64) {
+            /* 64-bit args must be passed in a reg-pair of the form
+ n:n+1, where n is even. Hence either r0:r1 or r2:r3.
+ On a little-endian host, the less significant word is
+ passed in the lower-numbered register. */
+ if (nextArgReg & 1) {
+ if (nextArgReg >= ARM_N_ARGREGS)
+ return False; /* out of argregs */
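+               /* This reg is only skipped to get the following pair
+                  aligned; fill it with recognisable junk, since it
+                  carries no argument. */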
+ addInstr(env, ARMInstr_Imm32( argregs[nextArgReg], 0xAA ));
+ nextArgReg++;
+ }
+ if (nextArgReg >= ARM_N_ARGREGS)
+ return False; /* out of argregs */
+ HReg raHi, raLo;
+ iselInt64Expr(&raHi, &raLo, env, args[i]);
+ addInstr(env, mk_iMOVds_RR( argregs[nextArgReg], raLo ));
+ nextArgReg++;
+ addInstr(env, mk_iMOVds_RR( argregs[nextArgReg], raHi ));
+ nextArgReg++;
+ }
+ else
+ return False; /* unhandled arg type */
+ }
+
+ /* Fast scheme only applies for unconditional calls. Hence: */
+ cc = ARMcc_AL;
+
+ } else {
+
+ /* SLOW SCHEME; move via temporaries */
+ nextArgReg = 0;
+
+ if (passBBP) {
+ /* This is pretty stupid; better to move directly to r0
+ after the rest of the args are done. */
+ tmpregs[nextArgReg] = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR( tmpregs[nextArgReg],
+ hregARM_R8() ));
+ nextArgReg++;
+ }
+
+ for (i = 0; i < n_args; i++) {
+ IRType aTy = typeOfIRExpr(env->type_env, args[i]);
+ if (nextArgReg >= ARM_N_ARGREGS)
+ return False; /* out of argregs */
+ if (aTy == Ity_I32) {
+ tmpregs[nextArgReg] = iselIntExpr_R(env, args[i]);
+ nextArgReg++;
+ }
+ else if (aTy == Ity_I64) {
+ /* Same comment applies as in the Fast-scheme case. */
+ if (nextArgReg & 1)
+ nextArgReg++;
+ if (nextArgReg + 1 >= ARM_N_ARGREGS)
+ return False; /* out of argregs */
+ HReg raHi, raLo;
+ iselInt64Expr(&raHi, &raLo, env, args[i]);
+ tmpregs[nextArgReg] = raLo;
+ nextArgReg++;
+ tmpregs[nextArgReg] = raHi;
+ nextArgReg++;
+ }
+ }
+
+ /* Now we can compute the condition. We can't do it earlier
+ because the argument computations could trash the condition
+ codes. Be a bit clever to handle the common case where the
+ guard is 1:Bit. */
+ cc = ARMcc_AL;
+ if (guard) {
+ if (guard->tag == Iex_Const
+ && guard->Iex.Const.con->tag == Ico_U1
+ && guard->Iex.Const.con->Ico.U1 == True) {
+ /* unconditional -- do nothing */
+ } else {
+ cc = iselCondCode( env, guard );
+ }
+ }
+
+ /* Move the args to their final destinations. */
+ for (i = 0; i < nextArgReg; i++) {
+      if (tmpregs[i] == INVALID_HREG) { // padding reg; fill with junk
+ addInstr(env, ARMInstr_Imm32( argregs[i], 0xAA ));
+ continue;
+ }
+ /* None of these insns, including any spill code that might
+ be generated, may alter the condition codes. */
+ addInstr( env, mk_iMOVds_RR( argregs[i], tmpregs[i] ) );
+ }
+
+ }
+
+ /* Should be assured by checks above */
+ vassert(nextArgReg <= ARM_N_ARGREGS);
+
+ target = (HWord)Ptr_to_ULong(cee->addr);
+
+ /* nextArgReg doles out argument registers. Since these are
+ assigned in the order r0, r1, r2, r3, its numeric value at this
+ point, which must be between 0 and 4 inclusive, is going to be
+ equal to the number of arg regs in use for the call. Hence bake
+ that number into the call (we'll need to know it when doing
+      register allocation, to know what regs the call reads).
+
+ There is a bit of a twist -- harmless but worth recording.
+ Suppose the arg types are (Ity_I32, Ity_I64). Then we will have
+ the first arg in r0 and the second in r3:r2, but r1 isn't used.
+ We nevertheless have nextArgReg==4 and bake that into the call
+      instruction. This will mean the register allocator will believe
+ this insn reads r1 when in fact it doesn't. But that's
+ harmless; it just artificially extends the live range of r1
+ unnecessarily. The best fix would be to put into the
+ instruction, a bitmask indicating which of r0/1/2/3 carry live
+ values. But that's too much hassle. */
+
+ /* Finally, the call itself. */
+ addInstr(env, ARMInstr_Call( cc, target, nextArgReg ));
+
+ return True; /* success */
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expressions (32/16/8 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Select insns for an integer-typed expression, and add them to the
+ code list. Return a reg holding the result. This reg will be a
+ virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
+ want to modify it, ask for a new vreg, copy it in there, and modify
+ the copy. The register allocator will do its best to map both
+ vregs to the same real register, so the copies will often disappear
+ later in the game.
+
+ This should handle expressions of 32, 16 and 8-bit type. All
+ results are returned in a 32-bit register. For 16- and 8-bit
+ expressions, the upper 16/24 bits are arbitrary, so you should mask
+ or sign extend partial values if necessary.
+*/
+
+/* --------------------- AMode1 --------------------- */
+
+/* Return an AMode1 which computes the value of the specified
+ expression, possibly also adding insns to the code list as a
+ result. The expression may only be a 32-bit one.
+*/
+
+static Bool sane_AMode1 ( ARMAMode1* am )
+{
+ switch (am->tag) {
+ case ARMam1_RI:
+ return
+ toBool( hregClass(am->ARMam1.RI.reg) == HRcInt32
+ && (hregIsVirtual(am->ARMam1.RI.reg)
+ || am->ARMam1.RI.reg == hregARM_R8())
+ && am->ARMam1.RI.simm13 >= -4095
+ && am->ARMam1.RI.simm13 <= 4095 );
+ case ARMam1_RRS:
+ return
+ toBool( hregClass(am->ARMam1.RRS.base) == HRcInt32
+ && hregIsVirtual(am->ARMam1.RRS.base)
+ && hregClass(am->ARMam1.RRS.index) == HRcInt32
+ && hregIsVirtual(am->ARMam1.RRS.index)
+ && am->ARMam1.RRS.shift >= 0
+ && am->ARMam1.RRS.shift <= 3 );
+ default:
+ vpanic("sane_AMode: unknown ARM AMode1 tag");
+ }
+}
+
+static ARMAMode1* iselIntExpr_AMode1 ( ISelEnv* env, IRExpr* e )
+{
+ ARMAMode1* am = iselIntExpr_AMode1_wrk(env, e);
+ vassert(sane_AMode1(am));
+ return am;
+}
+
+static ARMAMode1* iselIntExpr_AMode1_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32);
+
+ /* FIXME: add RRS matching */
+
+ /* {Add32,Sub32}(expr,simm13) */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_Add32 || e->Iex.Binop.op == Iop_Sub32)
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
+ Int simm = (Int)e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
+ if (simm >= -4095 && simm <= 4095) {
+ HReg reg;
+ if (e->Iex.Binop.op == Iop_Sub32)
+ simm = -simm;
+ reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ return ARMAMode1_RI(reg, simm);
+ }
+ }
+
+ /* Doesn't match anything in particular. Generate it into
+ a register and use that. */
+ {
+ HReg reg = iselIntExpr_R(env, e);
+ return ARMAMode1_RI(reg, 0);
+ }
+
+}
+
+
+/* --------------------- AMode2 --------------------- */
+
+/* Return an AMode2 which computes the value of the specified
+ expression, possibly also adding insns to the code list as a
+ result. The expression may only be a 32-bit one.
+*/
+
+static Bool sane_AMode2 ( ARMAMode2* am )
+{
+ switch (am->tag) {
+ case ARMam2_RI:
+ return
+ toBool( hregClass(am->ARMam2.RI.reg) == HRcInt32
+ && hregIsVirtual(am->ARMam2.RI.reg)
+ && am->ARMam2.RI.simm9 >= -255
+ && am->ARMam2.RI.simm9 <= 255 );
+ case ARMam2_RR:
+ return
+ toBool( hregClass(am->ARMam2.RR.base) == HRcInt32
+ && hregIsVirtual(am->ARMam2.RR.base)
+ && hregClass(am->ARMam2.RR.index) == HRcInt32
+ && hregIsVirtual(am->ARMam2.RR.index) );
+ default:
+ vpanic("sane_AMode: unknown ARM AMode2 tag");
+ }
+}
+
+static ARMAMode2* iselIntExpr_AMode2 ( ISelEnv* env, IRExpr* e )
+{
+ ARMAMode2* am = iselIntExpr_AMode2_wrk(env, e);
+ vassert(sane_AMode2(am));
+ return am;
+}
+
+static ARMAMode2* iselIntExpr_AMode2_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32);
+
+ /* FIXME: add RR matching */
+
+ /* {Add32,Sub32}(expr,simm8) */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_Add32 || e->Iex.Binop.op == Iop_Sub32)
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
+ Int simm = (Int)e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
+ if (simm >= -255 && simm <= 255) {
+ HReg reg;
+ if (e->Iex.Binop.op == Iop_Sub32)
+ simm = -simm;
+ reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ return ARMAMode2_RI(reg, simm);
+ }
+ }
+
+ /* Doesn't match anything in particular. Generate it into
+ a register and use that. */
+ {
+ HReg reg = iselIntExpr_R(env, e);
+ return ARMAMode2_RI(reg, 0);
+ }
+
+}
+
+
+/* --------------------- AModeV --------------------- */
+
+/* Return an AModeV which computes the value of the specified
+ expression, possibly also adding insns to the code list as a
+ result. The expression may only be a 32-bit one.
+*/
+
+static Bool sane_AModeV ( ARMAModeV* am )
+{
+ return toBool( hregClass(am->reg) == HRcInt32
+ && hregIsVirtual(am->reg)
+ && am->simm11 >= -1020 && am->simm11 <= 1020
+ && 0 == (am->simm11 & 3) );
+}
+
+static ARMAModeV* iselIntExpr_AModeV ( ISelEnv* env, IRExpr* e )
+{
+ ARMAModeV* am = iselIntExpr_AModeV_wrk(env, e);
+ vassert(sane_AModeV(am));
+ return am;
+}
+
+static ARMAModeV* iselIntExpr_AModeV_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32);
+
+ /* {Add32,Sub32}(expr, simm8 << 2) */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_Add32 || e->Iex.Binop.op == Iop_Sub32)
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
+ Int simm = (Int)e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
+ if (simm >= -1020 && simm <= 1020 && 0 == (simm & 3)) {
+ HReg reg;
+ if (e->Iex.Binop.op == Iop_Sub32)
+ simm = -simm;
+ reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ return mkARMAModeV(reg, simm);
+ }
+ }
+
+ /* Doesn't match anything in particular. Generate it into
+ a register and use that. */
+ {
+ HReg reg = iselIntExpr_R(env, e);
+ return mkARMAModeV(reg, 0);
+ }
+
+}
+
+/* --------------------- AModeN --------------------- */
+
+static ARMAModeN* iselIntExpr_AModeN ( ISelEnv* env, IRExpr* e )
+{
+ return iselIntExpr_AModeN_wrk(env, e);
+}
+
+static ARMAModeN* iselIntExpr_AModeN_wrk ( ISelEnv* env, IRExpr* e )
+{
+ HReg reg = iselIntExpr_R(env, e);
+ return mkARMAModeN_R(reg);
+}
+
+
+/* --------------------- RI84 --------------------- */
+
+/* Select instructions to generate 'e' into a RI84. If mayInv is
+ true, then the caller will also accept an I84 form that denotes
+   'not e'. In this case didInv may not be NULL, and *didInv is set
+   to True iff the inverted form was produced. This complication is
+   so as to allow generation of an RI84
+ which is suitable for use in either an AND or BIC instruction,
+ without knowing (before this call) which one.
+*/
+static ARMRI84* iselIntExpr_RI84 ( /*OUT*/Bool* didInv, Bool mayInv,
+ ISelEnv* env, IRExpr* e )
+{
+ ARMRI84* ri;
+ if (mayInv)
+ vassert(didInv != NULL);
+ ri = iselIntExpr_RI84_wrk(didInv, mayInv, env, e);
+ /* sanity checks ... */
+ switch (ri->tag) {
+ case ARMri84_I84:
+ return ri;
+ case ARMri84_R:
+ vassert(hregClass(ri->ARMri84.R.reg) == HRcInt32);
+ vassert(hregIsVirtual(ri->ARMri84.R.reg));
+ return ri;
+ default:
+ vpanic("iselIntExpr_RI84: unknown arm RI84 tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static ARMRI84* iselIntExpr_RI84_wrk ( /*OUT*/Bool* didInv, Bool mayInv,
+ ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
+
+ if (didInv) *didInv = False;
+
+ /* special case: immediate */
+ if (e->tag == Iex_Const) {
+ UInt u, u8 = 0x100, u4 = 0x10; /* both invalid */
+ switch (e->Iex.Const.con->tag) {
+ case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
+ case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
+ case Ico_U8: u = 0xFF & (e->Iex.Const.con->Ico.U8); break;
+ default: vpanic("iselIntExpr_RI84.Iex_Const(armh)");
+ }
+ if (fitsIn8x4(&u8, &u4, u)) {
+ return ARMRI84_I84( (UShort)u8, (UShort)u4 );
+ }
+ if (mayInv && fitsIn8x4(&u8, &u4, ~u)) {
+ vassert(didInv);
+ *didInv = True;
+ return ARMRI84_I84( (UShort)u8, (UShort)u4 );
+ }
+ /* else fail, fall through to default case */
+ }
+
+ /* default case: calculate into a register and return that */
+ {
+ HReg r = iselIntExpr_R ( env, e );
+ return ARMRI84_R(r);
+ }
+}
+
+
+/* --------------------- RI5 --------------------- */
+
+/* Select instructions to generate 'e' into a RI5. */
+
+static ARMRI5* iselIntExpr_RI5 ( ISelEnv* env, IRExpr* e )
+{
+ ARMRI5* ri = iselIntExpr_RI5_wrk(env, e);
+ /* sanity checks ... */
+ switch (ri->tag) {
+ case ARMri5_I5:
+ return ri;
+ case ARMri5_R:
+ vassert(hregClass(ri->ARMri5.R.reg) == HRcInt32);
+ vassert(hregIsVirtual(ri->ARMri5.R.reg));
+ return ri;
+ default:
+ vpanic("iselIntExpr_RI5: unknown arm RI5 tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static ARMRI5* iselIntExpr_RI5_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32 || ty == Ity_I8);
+
+ /* special case: immediate */
+ if (e->tag == Iex_Const) {
+      UInt u;
+ switch (e->Iex.Const.con->tag) {
+ case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
+ case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
+ case Ico_U8: u = 0xFF & (e->Iex.Const.con->Ico.U8); break;
+ default: vpanic("iselIntExpr_RI5.Iex_Const(armh)");
+ }
+ if (u >= 1 && u <= 31) {
+ return ARMRI5_I5(u);
+ }
+ /* else fail, fall through to default case */
+ }
+
+ /* default case: calculate into a register and return that */
+ {
+ HReg r = iselIntExpr_R ( env, e );
+ return ARMRI5_R(r);
+ }
+}
+
+
+/* ------------------- CondCode ------------------- */
+
+/* Generate code to evaluate a bit-typed expression, returning the
+   condition code that corresponds to the expression notionally
+   having returned 1. */
+
+static ARMCondCode iselCondCode ( ISelEnv* env, IRExpr* e )
+{
+ ARMCondCode cc = iselCondCode_wrk(env,e);
+ vassert(cc != ARMcc_NV);
+ return cc;
+}
+
+static ARMCondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
+{
+ vassert(e);
+ vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
+
+ /* var */
+ if (e->tag == Iex_RdTmp) {
+ HReg rTmp = lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ /* CmpOrTst doesn't modify rTmp; so this is OK. */
+ ARMRI84* one = ARMRI84_I84(1,0);
+ addInstr(env, ARMInstr_CmpOrTst(False/*test*/, rTmp, one));
+ return ARMcc_NE;
+ }
+
+ /* Not1(e) */
+ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
+ /* Generate code for the arg, and negate the test condition */
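+      /* ARM condition codes come in complementary pairs differing
+         only in bit 0 (EQ/NE, HS/LO, GT/LE, ...), so flipping the
+         bottom bit negates the condition. */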
+ return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
+ }
+
+ /* --- patterns rooted at: 32to1 --- */
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_32to1) {
+ HReg rTmp = iselIntExpr_R(env, e->Iex.Unop.arg);
+ ARMRI84* one = ARMRI84_I84(1,0);
+ addInstr(env, ARMInstr_CmpOrTst(False/*test*/, rTmp, one));
+ return ARMcc_NE;
+ }
+
+ /* --- patterns rooted at: CmpNEZ8 --- */
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ8) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
+ ARMRI84* xFF = ARMRI84_I84(0xFF,0);
+ addInstr(env, ARMInstr_CmpOrTst(False/*!isCmp*/, r1, xFF));
+ return ARMcc_NE;
+ }
+
+ /* --- patterns rooted at: CmpNEZ32 --- */
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ32) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
+ ARMRI84* zero = ARMRI84_I84(0,0);
+ addInstr(env, ARMInstr_CmpOrTst(True/*isCmp*/, r1, zero));
+ return ARMcc_NE;
+ }
+
+ /* --- patterns rooted at: CmpNEZ64 --- */
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ64) {
+ HReg tHi, tLo;
+ HReg tmp = newVRegI(env);
+ ARMRI84* zero = ARMRI84_I84(0,0);
+ iselInt64Expr(&tHi, &tLo, env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_Alu(ARMalu_OR, tmp, tHi, ARMRI84_R(tLo)));
+ addInstr(env, ARMInstr_CmpOrTst(True/*isCmp*/, tmp, zero));
+ return ARMcc_NE;
+ }
+
+ /* --- Cmp*32*(x,y) --- */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ32
+ || e->Iex.Binop.op == Iop_CmpNE32
+ || e->Iex.Binop.op == Iop_CmpLT32S
+ || e->Iex.Binop.op == Iop_CmpLT32U
+ || e->Iex.Binop.op == Iop_CmpLE32S
+ || e->Iex.Binop.op == Iop_CmpLE32U)) {
+ HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ ARMRI84* argR = iselIntExpr_RI84(NULL,False,
+ env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_CmpOrTst(True/*isCmp*/, argL, argR));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ32: return ARMcc_EQ;
+ case Iop_CmpNE32: return ARMcc_NE;
+ case Iop_CmpLT32S: return ARMcc_LT;
+ case Iop_CmpLT32U: return ARMcc_LO;
+ case Iop_CmpLE32S: return ARMcc_LE;
+ case Iop_CmpLE32U: return ARMcc_LS;
+ default: vpanic("iselCondCode(arm): CmpXX32");
+ }
+ }
+
+ /* --- CasCmpEQ* --- */
+ /* Ist_Cas has a dummy argument to compare with, so comparison is
+ always true. */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CasCmpEQ32
+ || e->Iex.Binop.op == Iop_CasCmpEQ16
+ || e->Iex.Binop.op == Iop_CasCmpEQ8)) {
+ return ARMcc_AL;
+ }
+
+ ppIRExpr(e);
+ vpanic("iselCondCode");
+}
+
+
+/* --------------------- Reg --------------------- */
+
+static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselIntExpr_R_wrk(env, e);
+ /* sanity checks ... */
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcInt32);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
+// vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
+
+ switch (e->tag) {
+
+ /* --------- TEMP --------- */
+ case Iex_RdTmp: {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ /* --------- LOAD --------- */
+ case Iex_Load: {
+ HReg dst = newVRegI(env);
+
+ if (e->Iex.Load.end != Iend_LE)
+ goto irreducible;
+
+ if (ty == Ity_I32) {
+ ARMAMode1* amode = iselIntExpr_AMode1 ( env, e->Iex.Load.addr );
+ addInstr(env, ARMInstr_LdSt32(True/*isLoad*/, dst, amode));
+ return dst;
+ }
+ if (ty == Ity_I16) {
+ ARMAMode2* amode = iselIntExpr_AMode2 ( env, e->Iex.Load.addr );
+ addInstr(env, ARMInstr_LdSt16(True/*isLoad*/, False/*!signedLoad*/,
+ dst, amode));
+ return dst;
+ }
+ if (ty == Ity_I8) {
+ ARMAMode1* amode = iselIntExpr_AMode1 ( env, e->Iex.Load.addr );
+ addInstr(env, ARMInstr_LdSt8U(True/*isLoad*/, dst, amode));
+ return dst;
+ }
+
+//zz if (ty == Ity_I16) {
+//zz addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
+//zz return dst;
+//zz }
+//zz if (ty == Ity_I8) {
+//zz addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
+//zz return dst;
+//zz }
+ break;
+ }
+
+//zz /* --------- TERNARY OP --------- */
+//zz case Iex_Triop: {
+//zz /* C3210 flags following FPU partial remainder (fprem), both
+//zz IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
+//zz if (e->Iex.Triop.op == Iop_PRemC3210F64
+//zz || e->Iex.Triop.op == Iop_PRem1C3210F64) {
+//zz HReg junk = newVRegF(env);
+//zz HReg dst = newVRegI(env);
+//zz HReg srcL = iselDblExpr(env, e->Iex.Triop.arg2);
+//zz HReg srcR = iselDblExpr(env, e->Iex.Triop.arg3);
+//zz /* XXXROUNDINGFIXME */
+//zz /* set roundingmode here */
+//zz addInstr(env, X86Instr_FpBinary(
+//zz e->Iex.Binop.op==Iop_PRemC3210F64
+//zz ? Xfp_PREM : Xfp_PREM1,
+//zz srcL,srcR,junk
+//zz ));
+//zz /* The previous pseudo-insn will have left the FPU's C3210
+//zz flags set correctly. So bag them. */
+//zz addInstr(env, X86Instr_FpStSW_AX());
+//zz addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
+//zz addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
+//zz return dst;
+//zz }
+//zz
+//zz break;
+//zz }
+
+ /* --------- BINARY OP --------- */
+ case Iex_Binop: {
+
+ ARMAluOp aop = 0; /* invalid */
+ ARMShiftOp sop = 0; /* invalid */
+
+ /* ADD/SUB/AND/OR/XOR */
+ switch (e->Iex.Binop.op) {
+ case Iop_And32: {
+ Bool didInv = False;
+ HReg dst = newVRegI(env);
+ HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ ARMRI84* argR = iselIntExpr_RI84(&didInv, True/*mayInv*/,
+ env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_Alu(didInv ? ARMalu_BIC : ARMalu_AND,
+ dst, argL, argR));
+ return dst;
+ }
+ case Iop_Or32: aop = ARMalu_OR; goto std_binop;
+ case Iop_Xor32: aop = ARMalu_XOR; goto std_binop;
+ case Iop_Sub32: aop = ARMalu_SUB; goto std_binop;
+ case Iop_Add32: aop = ARMalu_ADD; goto std_binop;
+ std_binop: {
+ HReg dst = newVRegI(env);
+ HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ ARMRI84* argR = iselIntExpr_RI84(NULL, False/*mayInv*/,
+ env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_Alu(aop, dst, argL, argR));
+ return dst;
+ }
+ default: break;
+ }
+
+ /* SHL/SHR/SAR */
+ switch (e->Iex.Binop.op) {
+ case Iop_Shl32: sop = ARMsh_SHL; goto sh_binop;
+ case Iop_Shr32: sop = ARMsh_SHR; goto sh_binop;
+ case Iop_Sar32: sop = ARMsh_SAR; goto sh_binop;
+ sh_binop: {
+ HReg dst = newVRegI(env);
+ HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ ARMRI5* argR = iselIntExpr_RI5(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_Shift(sop, dst, argL, argR));
+ vassert(ty == Ity_I32); /* else the IR is ill-typed */
+ return dst;
+ }
+ default: break;
+ }
+
+ /* MUL */
+ if (e->Iex.Binop.op == Iop_Mul32) {
+ HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR(hregARM_R2(), argL));
+ addInstr(env, mk_iMOVds_RR(hregARM_R3(), argR));
+ addInstr(env, ARMInstr_Mul(ARMmul_PLAIN));
+ addInstr(env, mk_iMOVds_RR(dst, hregARM_R0()));
+ return dst;
+ }
+
+ /* Handle misc other ops. */
+
+ if (e->Iex.Binop.op == Iop_Max32U) {
+ HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegI(env);
+ addInstr(env, ARMInstr_CmpOrTst(True/*isCmp*/, argL,
+ ARMRI84_R(argR)));
+ addInstr(env, mk_iMOVds_RR(dst, argL));
+ addInstr(env, ARMInstr_CMov(ARMcc_LO, dst, ARMRI84_R(argR)));
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_CmpF64) {
+ HReg dL = iselDblExpr(env, e->Iex.Binop.arg1);
+ HReg dR = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegI(env);
+ /* Do the compare (FCMPD) and set NZCV in FPSCR. Then also do
+ FMSTAT, so we can examine the results directly. */
+ addInstr(env, ARMInstr_VCmpD(dL, dR));
+ /* Create in dst, the IRCmpF64Result encoded result. */
+ addInstr(env, ARMInstr_Imm32(dst, 0));
+ addInstr(env, ARMInstr_CMov(ARMcc_EQ, dst, ARMRI84_I84(0x40,0))); //EQ
+ addInstr(env, ARMInstr_CMov(ARMcc_MI, dst, ARMRI84_I84(0x01,0))); //LT
+ addInstr(env, ARMInstr_CMov(ARMcc_GT, dst, ARMRI84_I84(0x00,0))); //GT
+ addInstr(env, ARMInstr_CMov(ARMcc_VS, dst, ARMRI84_I84(0x45,0))); //UN
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_F64toI32S
+ || e->Iex.Binop.op == Iop_F64toI32U) {
+         /* Wretched ugliness all round, due to having to deal
+ with rounding modes. Oh well. */
+ /* FIXME: if arg1 is a constant indicating round-to-zero,
+ then we could skip all this arsing around with FPSCR and
+ simply emit FTO{S,U}IZD. */
+ Bool syned = e->Iex.Binop.op == Iop_F64toI32S;
+ HReg valD = iselDblExpr(env, e->Iex.Binop.arg2);
+ set_VFP_rounding_mode(env, e->Iex.Binop.arg1);
+ /* FTO{S,U}ID valF, valD */
+ HReg valF = newVRegF(env);
+ addInstr(env, ARMInstr_VCvtID(False/*!iToD*/, syned,
+ valF, valD));
+ set_VFP_rounding_default(env);
+ /* VMOV dst, valF */
+ HReg dst = newVRegI(env);
+ addInstr(env, ARMInstr_VXferS(False/*!toS*/, valF, dst));
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_GetElem8x8
+ || e->Iex.Binop.op == Iop_GetElem16x4
+ || e->Iex.Binop.op == Iop_GetElem32x2) {
+ HReg res = newVRegI(env);
+         HReg arg = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ UInt index, size;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM target supports GetElem with constant "
+ "second argument only\n");
+ }
+ index = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ switch (e->Iex.Binop.op) {
+ case Iop_GetElem8x8: vassert(index < 8); size = 0; break;
+ case Iop_GetElem16x4: vassert(index < 4); size = 1; break;
+ case Iop_GetElem32x2: vassert(index < 2); size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnaryS(ARMneon_GETELEMS,
+ mkARMNRS(ARMNRS_Reg, res, 0),
+ mkARMNRS(ARMNRS_Scalar, arg, index),
+ size, False));
+ return res;
+ }
+
+ if (e->Iex.Binop.op == Iop_GetElem8x16
+ || e->Iex.Binop.op == Iop_GetElem16x8
+ || e->Iex.Binop.op == Iop_GetElem32x4) {
+ HReg res = newVRegI(env);
+         HReg arg = iselNeonExpr(env, e->Iex.Binop.arg1);
+ UInt index, size;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM target supports GetElem with constant "
+ "second argument only\n");
+ }
+ index = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ switch (e->Iex.Binop.op) {
+ case Iop_GetElem8x16: vassert(index < 16); size = 0; break;
+ case Iop_GetElem16x8: vassert(index < 8); size = 1; break;
+ case Iop_GetElem32x4: vassert(index < 4); size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnaryS(ARMneon_GETELEMS,
+ mkARMNRS(ARMNRS_Reg, res, 0),
+ mkARMNRS(ARMNRS_Scalar, arg, index),
+ size, True));
+ return res;
+ }
+
+ /* All cases involving host-side helper calls. */
+ void* fn = NULL;
+ switch (e->Iex.Binop.op) {
+ case Iop_Add16x2:
+ fn = &h_generic_calc_Add16x2; break;
+ case Iop_Sub16x2:
+ fn = &h_generic_calc_Sub16x2; break;
+ case Iop_HAdd16Ux2:
+ fn = &h_generic_calc_HAdd16Ux2; break;
+ case Iop_HAdd16Sx2:
+ fn = &h_generic_calc_HAdd16Sx2; break;
+ case Iop_HSub16Ux2:
+ fn = &h_generic_calc_HSub16Ux2; break;
+ case Iop_HSub16Sx2:
+ fn = &h_generic_calc_HSub16Sx2; break;
+ case Iop_QAdd16Sx2:
+ fn = &h_generic_calc_QAdd16Sx2; break;
+ case Iop_QSub16Sx2:
+ fn = &h_generic_calc_QSub16Sx2; break;
+ case Iop_Add8x4:
+ fn = &h_generic_calc_Add8x4; break;
+ case Iop_Sub8x4:
+ fn = &h_generic_calc_Sub8x4; break;
+ case Iop_HAdd8Ux4:
+ fn = &h_generic_calc_HAdd8Ux4; break;
+ case Iop_HAdd8Sx4:
+ fn = &h_generic_calc_HAdd8Sx4; break;
+ case Iop_HSub8Ux4:
+ fn = &h_generic_calc_HSub8Ux4; break;
+ case Iop_HSub8Sx4:
+ fn = &h_generic_calc_HSub8Sx4; break;
+ case Iop_QAdd8Sx4:
+ fn = &h_generic_calc_QAdd8Sx4; break;
+ case Iop_QAdd8Ux4:
+ fn = &h_generic_calc_QAdd8Ux4; break;
+ case Iop_QSub8Sx4:
+ fn = &h_generic_calc_QSub8Sx4; break;
+ case Iop_QSub8Ux4:
+ fn = &h_generic_calc_QSub8Ux4; break;
+ case Iop_Sad8Ux4:
+ fn = &h_generic_calc_Sad8Ux4; break;
+ default:
+ break;
+ }
+
+ if (fn) {
+ HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg res = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR(hregARM_R0(), regL));
+ addInstr(env, mk_iMOVds_RR(hregARM_R1(), regR));
+ addInstr(env, ARMInstr_Call( ARMcc_AL, (HWord)Ptr_to_ULong(fn), 2 ));
+ addInstr(env, mk_iMOVds_RR(res, hregARM_R0()));
+ return res;
+ }
+
+ break;
+ }
+
+ /* --------- UNARY OP --------- */
+ case Iex_Unop: {
+
+//zz /* 1Uto8(32to1(expr32)) */
+//zz if (e->Iex.Unop.op == Iop_1Uto8) {
+//zz DECLARE_PATTERN(p_32to1_then_1Uto8);
+//zz DEFINE_PATTERN(p_32to1_then_1Uto8,
+//zz unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
+//zz if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
+//zz IRExpr* expr32 = mi.bindee[0];
+//zz HReg dst = newVRegI(env);
+//zz HReg src = iselIntExpr_R(env, expr32);
+//zz addInstr(env, mk_iMOVsd_RR(src,dst) );
+//zz addInstr(env, X86Instr_Alu32R(Xalu_AND,
+//zz X86RMI_Imm(1), dst));
+//zz return dst;
+//zz }
+//zz }
+//zz
+//zz /* 8Uto32(LDle(expr32)) */
+//zz if (e->Iex.Unop.op == Iop_8Uto32) {
+//zz DECLARE_PATTERN(p_LDle8_then_8Uto32);
+//zz DEFINE_PATTERN(p_LDle8_then_8Uto32,
+//zz unop(Iop_8Uto32,
+//zz IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
+//zz if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
+//zz HReg dst = newVRegI(env);
+//zz X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+//zz addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
+//zz return dst;
+//zz }
+//zz }
+//zz
+//zz /* 8Sto32(LDle(expr32)) */
+//zz if (e->Iex.Unop.op == Iop_8Sto32) {
+//zz DECLARE_PATTERN(p_LDle8_then_8Sto32);
+//zz DEFINE_PATTERN(p_LDle8_then_8Sto32,
+//zz unop(Iop_8Sto32,
+//zz IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
+//zz if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
+//zz HReg dst = newVRegI(env);
+//zz X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+//zz addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
+//zz return dst;
+//zz }
+//zz }
+//zz
+//zz /* 16Uto32(LDle(expr32)) */
+//zz if (e->Iex.Unop.op == Iop_16Uto32) {
+//zz DECLARE_PATTERN(p_LDle16_then_16Uto32);
+//zz DEFINE_PATTERN(p_LDle16_then_16Uto32,
+//zz unop(Iop_16Uto32,
+//zz IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
+//zz if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
+//zz HReg dst = newVRegI(env);
+//zz X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+//zz addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
+//zz return dst;
+//zz }
+//zz }
+//zz
+//zz /* 8Uto32(GET:I8) */
+//zz if (e->Iex.Unop.op == Iop_8Uto32) {
+//zz if (e->Iex.Unop.arg->tag == Iex_Get) {
+//zz HReg dst;
+//zz X86AMode* amode;
+//zz vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
+//zz dst = newVRegI(env);
+//zz amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
+//zz hregX86_EBP());
+//zz addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
+//zz return dst;
+//zz }
+//zz }
+//zz
+//zz /* 16to32(GET:I16) */
+//zz if (e->Iex.Unop.op == Iop_16Uto32) {
+//zz if (e->Iex.Unop.arg->tag == Iex_Get) {
+//zz HReg dst;
+//zz X86AMode* amode;
+//zz vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
+//zz dst = newVRegI(env);
+//zz amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
+//zz hregX86_EBP());
+//zz addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
+//zz return dst;
+//zz }
+//zz }
+
+ switch (e->Iex.Unop.op) {
+ case Iop_8Uto32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_Alu(ARMalu_AND,
+ dst, src, ARMRI84_I84(0xFF,0)));
+ return dst;
+ }
+//zz case Iop_8Uto16:
+//zz case Iop_8Uto32:
+//zz case Iop_16Uto32: {
+//zz HReg dst = newVRegI(env);
+//zz HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+//zz UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
+//zz addInstr(env, mk_iMOVsd_RR(src,dst) );
+//zz addInstr(env, X86Instr_Alu32R(Xalu_AND,
+//zz X86RMI_Imm(mask), dst));
+//zz return dst;
+//zz }
+//zz case Iop_8Sto16:
+//zz case Iop_8Sto32:
+ case Iop_16Uto32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ ARMRI5* amt = ARMRI5_I5(16);
+ addInstr(env, ARMInstr_Shift(ARMsh_SHL, dst, src, amt));
+ addInstr(env, ARMInstr_Shift(ARMsh_SHR, dst, dst, amt));
+ return dst;
+ }
+ case Iop_8Sto32:
+ case Iop_16Sto32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ ARMRI5* amt = ARMRI5_I5(e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24);
+ addInstr(env, ARMInstr_Shift(ARMsh_SHL, dst, src, amt));
+ addInstr(env, ARMInstr_Shift(ARMsh_SAR, dst, dst, amt));
+ return dst;
+ }
+//zz case Iop_Not8:
+//zz case Iop_Not16:
+ case Iop_Not32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_Unary(ARMun_NOT, dst, src));
+ return dst;
+ }
+ case Iop_64HIto32: {
+ HReg rHi, rLo;
+ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rHi; /* and abandon rLo .. poor wee thing :-) */
+ }
+ case Iop_64to32: {
+ HReg rHi, rLo;
+ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rLo; /* similar stupid comment to the above ... */
+ }
+ case Iop_64to8: {
+ HReg rHi, rLo;
+ if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ HReg tmp = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_VXferD(False, tmp, tHi, tLo));
+ rHi = tHi;
+ rLo = tLo;
+ } else {
+ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ }
+ return rLo;
+ }
+//zz case Iop_16HIto8:
+//zz case Iop_32HIto16: {
+//zz HReg dst = newVRegI(env);
+//zz HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+//zz Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
+//zz addInstr(env, mk_iMOVsd_RR(src,dst) );
+//zz addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
+//zz return dst;
+//zz }
+ case Iop_1Uto32:
+ case Iop_1Uto8: {
+ HReg dst = newVRegI(env);
+ ARMCondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_Mov(dst, ARMRI84_I84(0,0)));
+ addInstr(env, ARMInstr_CMov(cond, dst, ARMRI84_I84(1,0)));
+ return dst;
+ }
+
+ case Iop_1Sto32: {
+ HReg dst = newVRegI(env);
+ ARMCondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ ARMRI5* amt = ARMRI5_I5(31);
+ /* This is really rough. We could do much better here;
+ perhaps mvn{cond} dst, #0 as the second insn?
+ (same applies to 1Sto64) */
+ addInstr(env, ARMInstr_Mov(dst, ARMRI84_I84(0,0)));
+ addInstr(env, ARMInstr_CMov(cond, dst, ARMRI84_I84(1,0)));
+ addInstr(env, ARMInstr_Shift(ARMsh_SHL, dst, dst, amt));
+ addInstr(env, ARMInstr_Shift(ARMsh_SAR, dst, dst, amt));
+ return dst;
+ }
+
+
+//zz case Iop_1Sto8:
+//zz case Iop_1Sto16:
+//zz case Iop_1Sto32: {
+//zz /* could do better than this, but for now ... */
+//zz HReg dst = newVRegI(env);
+//zz X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+//zz addInstr(env, X86Instr_Set32(cond,dst));
+//zz addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
+//zz addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
+//zz return dst;
+//zz }
+//zz case Iop_Ctz32: {
+//zz /* Count trailing zeroes, implemented by x86 'bsfl' */
+//zz HReg dst = newVRegI(env);
+//zz HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+//zz addInstr(env, X86Instr_Bsfr32(True,src,dst));
+//zz return dst;
+//zz }
+ case Iop_Clz32: {
+ /* Count leading zeroes; easy on ARM. */
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_Unary(ARMun_CLZ, dst, src));
+ return dst;
+ }
+
+ case Iop_CmpwNEZ32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
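+         /* Compute (src | -src) >>s 31: this is all ones iff
+            src != 0, since src | -src has the sign bit set exactly
+            when src is nonzero. */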
+ addInstr(env, ARMInstr_Unary(ARMun_NEG, dst, src));
+ addInstr(env, ARMInstr_Alu(ARMalu_OR, dst, dst, ARMRI84_R(src)));
+ addInstr(env, ARMInstr_Shift(ARMsh_SAR, dst, dst, ARMRI5_I5(31)));
+ return dst;
+ }
+
+ case Iop_Left32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
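+         /* Left32(x) is defined as x | -x, which is exactly what
+            this NEG-then-OR sequence computes. */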
+ addInstr(env, ARMInstr_Unary(ARMun_NEG, dst, src));
+ addInstr(env, ARMInstr_Alu(ARMalu_OR, dst, dst, ARMRI84_R(src)));
+ return dst;
+ }
+
+//zz case Iop_V128to32: {
+//zz HReg dst = newVRegI(env);
+//zz HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+//zz X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+//zz sub_from_esp(env, 16);
+//zz addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
+//zz addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
+//zz add_to_esp(env, 16);
+//zz return dst;
+//zz }
+//zz
+ case Iop_ReinterpF32asI32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselFltExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_VXferS(False/*!toS*/, src, dst));
+ return dst;
+ }
+
+//zz
+//zz case Iop_16to8:
+ case Iop_32to8:
+ case Iop_32to16:
+ /* These are no-ops. */
+ return iselIntExpr_R(env, e->Iex.Unop.arg);
+
+ default:
+ break;
+ }
+
+ /* All Unop cases involving host-side helper calls. */
+ void* fn = NULL;
+ switch (e->Iex.Unop.op) {
+ case Iop_CmpNEZ16x2:
+ fn = &h_generic_calc_CmpNEZ16x2; break;
+ case Iop_CmpNEZ8x4:
+ fn = &h_generic_calc_CmpNEZ8x4; break;
+ default:
+ break;
+ }
+
+ if (fn) {
+ HReg arg = iselIntExpr_R(env, e->Iex.Unop.arg);
+ HReg res = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR(hregARM_R0(), arg));
+ addInstr(env, ARMInstr_Call( ARMcc_AL, (HWord)Ptr_to_ULong(fn), 1 ));
+ addInstr(env, mk_iMOVds_RR(res, hregARM_R0()));
+ return res;
+ }
+
+ break;
+ }
+
+ /* --------- GET --------- */
+ case Iex_Get: {
+ if (ty == Ity_I32
+ && 0 == (e->Iex.Get.offset & 3)
+ && e->Iex.Get.offset < 4096-4) {
+ HReg dst = newVRegI(env);
+ addInstr(env, ARMInstr_LdSt32(
+ True/*isLoad*/,
+ dst,
+ ARMAMode1_RI(hregARM_R8(), e->Iex.Get.offset)));
+ return dst;
+ }
+//zz if (ty == Ity_I8 || ty == Ity_I16) {
+//zz HReg dst = newVRegI(env);
+//zz addInstr(env, X86Instr_LoadEX(
+//zz toUChar(ty==Ity_I8 ? 1 : 2),
+//zz False,
+//zz X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
+//zz dst));
+//zz return dst;
+//zz }
+ break;
+ }
+
+//zz case Iex_GetI: {
+//zz X86AMode* am
+//zz = genGuestArrayOffset(
+//zz env, e->Iex.GetI.descr,
+//zz e->Iex.GetI.ix, e->Iex.GetI.bias );
+//zz HReg dst = newVRegI(env);
+//zz if (ty == Ity_I8) {
+//zz addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
+//zz return dst;
+//zz }
+//zz if (ty == Ity_I32) {
+//zz addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
+//zz return dst;
+//zz }
+//zz break;
+//zz }
+
+ /* --------- CCALL --------- */
+ case Iex_CCall: {
+ HReg dst = newVRegI(env);
+ vassert(ty == e->Iex.CCall.retty);
+
+ /* be very restrictive for now. Only 32/64-bit ints allowed
+ for args, and 32 bits for return type. */
+ if (e->Iex.CCall.retty != Ity_I32)
+ goto irreducible;
+
+ /* Marshal args, do the call, clear stack. */
+ Bool ok = doHelperCall( env, False,
+ NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
+ if (ok) {
+ addInstr(env, mk_iMOVds_RR(dst, hregARM_R0()));
+ return dst;
+ }
+ /* else fall through; will hit the irreducible: label */
+ }
+
+ /* --------- LITERAL --------- */
+ /* 32-bit literals */
+ case Iex_Const: {
+ UInt u = 0;
+ HReg dst = newVRegI(env);
+ switch (e->Iex.Const.con->tag) {
+ case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
+ case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
+ case Ico_U8: u = 0xFF & (e->Iex.Const.con->Ico.U8); break;
+ default: ppIRExpr(e); vpanic("iselIntExpr_R.Iex_Const(arm)");
+ }
+ addInstr(env, ARMInstr_Imm32(dst, u));
+ return dst;
+ }
+
+ /* --------- MULTIPLEX --------- */
+ case Iex_Mux0X: {
+ IRExpr* cond = e->Iex.Mux0X.cond;
+
+ /* Mux0X( 32to8(1Uto32(ccexpr)), expr0, exprX ) */
+ if (ty == Ity_I32
+ && cond->tag == Iex_Unop
+ && cond->Iex.Unop.op == Iop_32to8
+ && cond->Iex.Unop.arg->tag == Iex_Unop
+ && cond->Iex.Unop.arg->Iex.Unop.op == Iop_1Uto32) {
+ ARMCondCode cc;
+ HReg rX = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
+ ARMRI84* r0 = iselIntExpr_RI84(NULL, False, env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR(dst, rX));
+ cc = iselCondCode(env, cond->Iex.Unop.arg->Iex.Unop.arg);
+ addInstr(env, ARMInstr_CMov(cc ^ 1, dst, r0));
+ return dst;
+ }
+
+ /* Mux0X(cond, expr0, exprX) (general case) */
+ if (ty == Ity_I32) {
+ HReg r8;
+ HReg rX = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
+ ARMRI84* r0 = iselIntExpr_RI84(NULL, False, env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR(dst, rX));
+ r8 = iselIntExpr_R(env, cond);
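+ /* cond has type Ity_I8; the TST against 0xFF sets Z exactly
+ when its low byte is zero, so the EQ-conditional move
+ selects expr0 in that case. */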
+ addInstr(env, ARMInstr_CmpOrTst(False/*!isCmp*/, r8,
+ ARMRI84_I84(0xFF,0)));
+ addInstr(env, ARMInstr_CMov(ARMcc_EQ, dst, r0));
+ return dst;
+ }
+ break;
+ }
+
+ default:
+ break;
+ } /* switch (e->tag) */
+
+ /* We get here if no pattern matched. */
+ irreducible:
+ ppIRExpr(e);
+ vpanic("iselIntExpr_R: cannot reduce tree");
+}
+
+
+/* -------------------- 64-bit -------------------- */
+
+/* Compute a 64-bit value into a register pair, which is returned as
+ the first two parameters. As with iselIntExpr_R, these may be
+ either real or virtual regs; in any case they must not be changed
+ by subsequent code emitted by the caller. */
+
+static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
+{
+ iselInt64Expr_wrk(rHi, rLo, env, e);
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(*rHi) == HRcInt32);
+ vassert(hregIsVirtual(*rHi));
+ vassert(hregClass(*rLo) == HRcInt32);
+ vassert(hregIsVirtual(*rLo));
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
+{
+ vassert(e);
+ vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
+
+ /* 64-bit literal */
+ if (e->tag == Iex_Const) {
+ ULong w64 = e->Iex.Const.con->Ico.U64;
+ UInt wHi = toUInt(w64 >> 32);
+ UInt wLo = toUInt(w64);
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ vassert(e->Iex.Const.con->tag == Ico_U64);
+ addInstr(env, ARMInstr_Imm32(tHi, wHi));
+ addInstr(env, ARMInstr_Imm32(tLo, wLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* read 64-bit IRTemp */
+ if (e->tag == Iex_RdTmp) {
+ if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ HReg tmp = iselNeon64Expr(env, e);
+ addInstr(env, ARMInstr_VXferD(False, tmp, tHi, tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ } else {
+ lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
+ }
+ return;
+ }
+
+ /* 64-bit load */
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ HReg tLo, tHi, rA;
+ vassert(e->Iex.Load.ty == Ity_I64);
+ rA = iselIntExpr_R(env, e->Iex.Load.addr);
+ tHi = newVRegI(env);
+ tLo = newVRegI(env);
+ addInstr(env, ARMInstr_LdSt32(True/*isLoad*/, tHi, ARMAMode1_RI(rA, 4)));
+ addInstr(env, ARMInstr_LdSt32(True/*isLoad*/, tLo, ARMAMode1_RI(rA, 0)));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64-bit GET */
+ if (e->tag == Iex_Get) {
+ ARMAMode1* am0 = ARMAMode1_RI(hregARM_R8(), e->Iex.Get.offset + 0);
+ ARMAMode1* am4 = ARMAMode1_RI(hregARM_R8(), e->Iex.Get.offset + 4);
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ addInstr(env, ARMInstr_LdSt32(True/*isLoad*/, tHi, am4));
+ addInstr(env, ARMInstr_LdSt32(True/*isLoad*/, tLo, am0));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* --------- BINARY ops --------- */
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+
+ /* 32 x 32 -> 64 multiply */
+ case Iop_MullS32:
+ case Iop_MullU32: {
+ HReg argL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ ARMMulOp mop = e->Iex.Binop.op == Iop_MullS32
+ ? ARMmul_SX : ARMmul_ZX;
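+ /* ARMInstr_Mul implicitly takes its operands in r2 and r3 and
+ writes the 64-bit product to r1:r0, hence the fixed-register
+ moves around it. */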
+ addInstr(env, mk_iMOVds_RR(hregARM_R2(), argL));
+ addInstr(env, mk_iMOVds_RR(hregARM_R3(), argR));
+ addInstr(env, ARMInstr_Mul(mop));
+ addInstr(env, mk_iMOVds_RR(tHi, hregARM_R1()));
+ addInstr(env, mk_iMOVds_RR(tLo, hregARM_R0()));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ case Iop_Or64: {
+ HReg xLo, xHi, yLo, yHi;
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_Alu(ARMalu_OR, tHi, xHi, ARMRI84_R(yHi)));
+ addInstr(env, ARMInstr_Alu(ARMalu_OR, tLo, xLo, ARMRI84_R(yLo)));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ case Iop_Add64: {
+ HReg xLo, xHi, yLo, yHi;
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
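+ /* tLo = xLo + yLo, setting the carry flag;
+ tHi = xHi + yHi + carry. */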
+ addInstr(env, ARMInstr_Alu(ARMalu_ADDS, tLo, xLo, ARMRI84_R(yLo)));
+ addInstr(env, ARMInstr_Alu(ARMalu_ADC, tHi, xHi, ARMRI84_R(yHi)));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 32HLto64(e1,e2) */
+ case Iop_32HLto64: {
+ *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ return;
+ }
+
+ default:
+ break;
+ }
+ }
+
+ /* --------- UNARY ops --------- */
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+
+ /* ReinterpF64asI64 */
+ case Iop_ReinterpF64asI64: {
+ HReg dstHi = newVRegI(env);
+ HReg dstLo = newVRegI(env);
+ HReg src = iselDblExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_VXferD(False/*!toD*/, src, dstHi, dstLo));
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+
+ /* Left64(e) */
+ case Iop_Left64: {
+ HReg yLo, yHi;
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ HReg zero = newVRegI(env);
+ /* yHi:yLo = arg */
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
+ /* zero = 0 */
+ addInstr(env, ARMInstr_Imm32(zero, 0));
+ /* tLo = 0 - yLo, and set carry */
+ addInstr(env, ARMInstr_Alu(ARMalu_SUBS,
+ tLo, zero, ARMRI84_R(yLo)));
+ /* tHi = 0 - yHi - carry */
+ addInstr(env, ARMInstr_Alu(ARMalu_SBC,
+ tHi, zero, ARMRI84_R(yHi)));
+ /* So now we have tHi:tLo = -arg. To finish off, or 'arg'
+ back in, so as to give the final result
+ tHi:tLo = arg | -arg. */
+ addInstr(env, ARMInstr_Alu(ARMalu_OR, tHi, tHi, ARMRI84_R(yHi)));
+ addInstr(env, ARMInstr_Alu(ARMalu_OR, tLo, tLo, ARMRI84_R(yLo)));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* CmpwNEZ64(e) */
+ case Iop_CmpwNEZ64: {
+ HReg srcLo, srcHi;
+ HReg tmp1 = newVRegI(env);
+ HReg tmp2 = newVRegI(env);
+ /* srcHi:srcLo = arg */
+ iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
+ /* tmp1 = srcHi | srcLo */
+ addInstr(env, ARMInstr_Alu(ARMalu_OR,
+ tmp1, srcHi, ARMRI84_R(srcLo)));
+ /* tmp2 = (tmp1 | -tmp1) >>s 31 */
+ addInstr(env, ARMInstr_Unary(ARMun_NEG, tmp2, tmp1));
+ addInstr(env, ARMInstr_Alu(ARMalu_OR,
+ tmp2, tmp2, ARMRI84_R(tmp1)));
+ addInstr(env, ARMInstr_Shift(ARMsh_SAR,
+ tmp2, tmp2, ARMRI5_I5(31)));
+ *rHi = tmp2;
+ *rLo = tmp2;
+ return;
+ }
+
+ case Iop_1Sto64: {
+ HReg dst = newVRegI(env);
+ ARMCondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ ARMRI5* amt = ARMRI5_I5(31);
+ /* This is really rough. We could do much better here;
+ perhaps mvn{cond} dst, #0 as the second insn?
+ (same applies to 1Sto32) */
+ addInstr(env, ARMInstr_Mov(dst, ARMRI84_I84(0,0)));
+ addInstr(env, ARMInstr_CMov(cond, dst, ARMRI84_I84(1,0)));
+ addInstr(env, ARMInstr_Shift(ARMsh_SHL, dst, dst, amt));
+ addInstr(env, ARMInstr_Shift(ARMsh_SAR, dst, dst, amt));
+ *rHi = dst;
+ *rLo = dst;
+ return;
+ }
+
+ default:
+ break;
+ }
+ } /* if (e->tag == Iex_Unop) */
+
+ /* --------- MULTIPLEX --------- */
+ if (e->tag == Iex_Mux0X) {
+ IRType ty8;
+ HReg r8, rXhi, rXlo, r0hi, r0lo, dstHi, dstLo;
+ ty8 = typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond);
+ vassert(ty8 == Ity_I8);
+ iselInt64Expr(&rXhi, &rXlo, env, e->Iex.Mux0X.exprX);
+ iselInt64Expr(&r0hi, &r0lo, env, e->Iex.Mux0X.expr0);
+ dstHi = newVRegI(env);
+ dstLo = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR(dstHi, rXhi));
+ addInstr(env, mk_iMOVds_RR(dstLo, rXlo));
+ r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
+ addInstr(env, ARMInstr_CmpOrTst(False/*!isCmp*/, r8,
+ ARMRI84_I84(0xFF,0)));
+ addInstr(env, ARMInstr_CMov(ARMcc_EQ, dstHi, ARMRI84_R(r0hi)));
+ addInstr(env, ARMInstr_CMov(ARMcc_EQ, dstLo, ARMRI84_R(r0lo)));
+ *rHi = dstHi;
+ *rLo = dstLo;
+ return;
+ }
+
+ /* It is convenient sometimes to call iselInt64Expr even when we
+ have NEON support (e.g. in doHelperCall we need 64-bit
+ arguments as 2 x 32-bit regs). */
+ if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ HReg tmp = iselNeon64Expr(env, e);
+ addInstr(env, ARMInstr_VXferD(False, tmp, tHi, tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ ppIRExpr(e);
+ vpanic("iselInt64Expr");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Vector (NEON) expressions (64 or 128 bit) ---*/
+/*---------------------------------------------------------*/
+
+static HReg iselNeon64Expr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselNeon64Expr_wrk( env, e );
+ vassert(hregClass(r) == HRcFlt64);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselNeon64Expr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env, e);
+ MatchInfo mi;
+ vassert(e);
+ vassert(ty == Ity_I64);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Const) {
+ HReg rLo, rHi;
+ HReg res = newVRegD(env);
+ iselInt64Expr(&rHi, &rLo, env, e);
+ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo));
+ return res;
+ }
+
+ /* 64-bit load */
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ HReg res = newVRegD(env);
+ ARMAModeN* am = iselIntExpr_AModeN(env, e->Iex.Load.addr);
+ vassert(ty == Ity_I64);
+ addInstr(env, ARMInstr_NLdStD(True, res, am));
+ return res;
+ }
+
+ /* 64-bit GET */
+ if (e->tag == Iex_Get) {
+ HReg addr = newVRegI(env);
+ HReg res = newVRegD(env);
+ vassert(ty == Ity_I64);
+ addInstr(env, ARMInstr_Add32(addr, hregARM_R8(), e->Iex.Get.offset));
+ addInstr(env, ARMInstr_NLdStD(True, res, mkARMAModeN_R(addr)));
+ return res;
+ }
+
+ /* --------- BINARY ops --------- */
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+
+ /* 32 x 32 -> 64 multiply */
+ case Iop_MullS32:
+ case Iop_MullU32: {
+ HReg rLo, rHi;
+ HReg res = newVRegD(env);
+ iselInt64Expr(&rHi, &rLo, env, e);
+ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo));
+ return res;
+ }
+
+ case Iop_And64: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VAND,
+ res, argL, argR, 4, False));
+ return res;
+ }
+ case Iop_Or64: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VORR,
+ res, argL, argR, 4, False));
+ return res;
+ }
+ case Iop_Xor64: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VXOR,
+ res, argL, argR, 4, False));
+ return res;
+ }
+
+ /* 32HLto64(e1,e2) */
+ case Iop_32HLto64: {
+ HReg rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg res = newVRegD(env);
+ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo));
+ return res;
+ }
+
+ case Iop_Add8x8:
+ case Iop_Add16x4:
+ case Iop_Add32x2:
+ case Iop_Add64: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Add8x8: size = 0; break;
+ case Iop_Add16x4: size = 1; break;
+ case Iop_Add32x2: size = 2; break;
+ case Iop_Add64: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VADD,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Add32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VADDFP,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Recps32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VRECPS,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Rsqrts32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VRSQRTS,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_InterleaveOddLanes8x8:
+ case Iop_InterleaveOddLanes16x4:
+ case Iop_InterleaveLO32x2:
+ case Iop_InterleaveEvenLanes8x8:
+ case Iop_InterleaveEvenLanes16x4:
+ case Iop_InterleaveHI32x2: {
+ HReg tmp = newVRegD(env);
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ UInt is_lo;
+ switch (e->Iex.Binop.op) {
+ case Iop_InterleaveOddLanes8x8: is_lo = 1; size = 0; break;
+ case Iop_InterleaveEvenLanes8x8: is_lo = 0; size = 0; break;
+ case Iop_InterleaveOddLanes16x4: is_lo = 1; size = 1; break;
+ case Iop_InterleaveEvenLanes16x4: is_lo = 0; size = 1; break;
+ case Iop_InterleaveLO32x2: is_lo = 1; size = 2; break;
+ case Iop_InterleaveHI32x2: is_lo = 0; size = 2; break;
+ default: vassert(0);
+ }
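+ /* VTRN exchanges lanes between its two operands in place, so
+ work on copies of the args; afterwards 'res' holds the
+ requested odd/even lanes drawn from both inputs. */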
+ if (is_lo) {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argL, 4, False));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argR, 4, False));
+ addInstr(env, ARMInstr_NDual(ARMneon_TRN,
+ res, tmp, size, False));
+ } else {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argR, 4, False));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argL, 4, False));
+ addInstr(env, ARMInstr_NDual(ARMneon_TRN,
+ tmp, res, size, False));
+ }
+ return res;
+ }
+ case Iop_InterleaveHI8x8:
+ case Iop_InterleaveHI16x4:
+ case Iop_InterleaveLO8x8:
+ case Iop_InterleaveLO16x4: {
+ HReg tmp = newVRegD(env);
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ UInt is_lo;
+ switch (e->Iex.Binop.op) {
+ case Iop_InterleaveHI8x8: is_lo = 1; size = 0; break;
+ case Iop_InterleaveLO8x8: is_lo = 0; size = 0; break;
+ case Iop_InterleaveHI16x4: is_lo = 1; size = 1; break;
+ case Iop_InterleaveLO16x4: is_lo = 0; size = 1; break;
+ default: vassert(0);
+ }
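+ /* VZIP interleaves its two operands in place; again work on
+ copies so the argument registers are left unmodified. */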
+ if (is_lo) {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argL, 4, False));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argR, 4, False));
+ addInstr(env, ARMInstr_NDual(ARMneon_ZIP,
+ res, tmp, size, False));
+ } else {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argR, 4, False));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argL, 4, False));
+ addInstr(env, ARMInstr_NDual(ARMneon_ZIP,
+ tmp, res, size, False));
+ }
+ return res;
+ }
+ case Iop_CatOddLanes8x8:
+ case Iop_CatOddLanes16x4:
+ case Iop_CatEvenLanes8x8:
+ case Iop_CatEvenLanes16x4: {
+ HReg tmp = newVRegD(env);
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ UInt is_lo;
+ switch (e->Iex.Binop.op) {
+ case Iop_CatOddLanes8x8: is_lo = 1; size = 0; break;
+ case Iop_CatEvenLanes8x8: is_lo = 0; size = 0; break;
+ case Iop_CatOddLanes16x4: is_lo = 1; size = 1; break;
+ case Iop_CatEvenLanes16x4: is_lo = 0; size = 1; break;
+ default: vassert(0);
+ }
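+ /* VUZP de-interleaves in place, separating even and odd lanes;
+ as above, copies protect the argument registers. */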
+ if (is_lo) {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argL, 4, False));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argR, 4, False));
+ addInstr(env, ARMInstr_NDual(ARMneon_UZP,
+ res, tmp, size, False));
+ } else {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argR, 4, False));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argL, 4, False));
+ addInstr(env, ARMInstr_NDual(ARMneon_UZP,
+ tmp, res, size, False));
+ }
+ return res;
+ }
+ case Iop_QAdd8Ux8:
+ case Iop_QAdd16Ux4:
+ case Iop_QAdd32Ux2:
+ case Iop_QAdd64Ux1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QAdd8Ux8: size = 0; break;
+ case Iop_QAdd16Ux4: size = 1; break;
+ case Iop_QAdd32Ux2: size = 2; break;
+ case Iop_QAdd64Ux1: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQADDU,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_QAdd8Sx8:
+ case Iop_QAdd16Sx4:
+ case Iop_QAdd32Sx2:
+ case Iop_QAdd64Sx1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QAdd8Sx8: size = 0; break;
+ case Iop_QAdd16Sx4: size = 1; break;
+ case Iop_QAdd32Sx2: size = 2; break;
+ case Iop_QAdd64Sx1: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQADDS,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Sub8x8:
+ case Iop_Sub16x4:
+ case Iop_Sub32x2:
+ case Iop_Sub64: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Sub8x8: size = 0; break;
+ case Iop_Sub16x4: size = 1; break;
+ case Iop_Sub32x2: size = 2; break;
+ case Iop_Sub64: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Sub32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUBFP,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_QSub8Ux8:
+ case Iop_QSub16Ux4:
+ case Iop_QSub32Ux2:
+ case Iop_QSub64Ux1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QSub8Ux8: size = 0; break;
+ case Iop_QSub16Ux4: size = 1; break;
+ case Iop_QSub32Ux2: size = 2; break;
+ case Iop_QSub64Ux1: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQSUBU,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_QSub8Sx8:
+ case Iop_QSub16Sx4:
+ case Iop_QSub32Sx2:
+ case Iop_QSub64Sx1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QSub8Sx8: size = 0; break;
+ case Iop_QSub16Sx4: size = 1; break;
+ case Iop_QSub32Sx2: size = 2; break;
+ case Iop_QSub64Sx1: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQSUBS,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Max8Ux8:
+ case Iop_Max16Ux4:
+ case Iop_Max32Ux2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Max8Ux8: size = 0; break;
+ case Iop_Max16Ux4: size = 1; break;
+ case Iop_Max32Ux2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXU,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Max8Sx8:
+ case Iop_Max16Sx4:
+ case Iop_Max32Sx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Max8Sx8: size = 0; break;
+ case Iop_Max16Sx4: size = 1; break;
+ case Iop_Max32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXS,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Min8Ux8:
+ case Iop_Min16Ux4:
+ case Iop_Min32Ux2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Min8Ux8: size = 0; break;
+ case Iop_Min16Ux4: size = 1; break;
+ case Iop_Min32Ux2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMINU,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Min8Sx8:
+ case Iop_Min16Sx4:
+ case Iop_Min32Sx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Min8Sx8: size = 0; break;
+ case Iop_Min16Sx4: size = 1; break;
+ case Iop_Min32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMINS,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Sar8x8:
+ case Iop_Sar16x4:
+ case Iop_Sar32x2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ HReg argR2 = newVRegD(env);
+ HReg zero = newVRegD(env);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Sar8x8: size = 0; break;
+ case Iop_Sar16x4: size = 1; break;
+ case Iop_Sar32x2: size = 2; break;
+ default: vassert(0);
+ }
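+ /* NEON register shifts shift left by a signed per-lane count,
+ with negative counts shifting right; so negate the counts
+ and use the signed (arithmetic) form. */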
+ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0,0)));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB,
+ argR2, zero, argR, size, False));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSAL,
+ res, argL, argR2, size, False));
+ return res;
+ }
+ case Iop_Sal8x8:
+ case Iop_Sal16x4:
+ case Iop_Sal32x2:
+ case Iop_Sal64x1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Sal8x8: size = 0; break;
+ case Iop_Sal16x4: size = 1; break;
+ case Iop_Sal32x2: size = 2; break;
+ case Iop_Sal64x1: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NShift(ARMneon_VSAL,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Shr8x8:
+ case Iop_Shr16x4:
+ case Iop_Shr32x2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ HReg argR2 = newVRegD(env);
+ HReg zero = newVRegD(env);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Shr8x8: size = 0; break;
+ case Iop_Shr16x4: size = 1; break;
+ case Iop_Shr32x2: size = 2; break;
+ default: vassert(0);
+ }
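+ /* Same negated-count trick as Sar above, but with the logical
+ shift form, giving a zero-filling right shift. */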
+ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0,0)));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB,
+ argR2, zero, argR, size, False));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ res, argL, argR2, size, False));
+ return res;
+ }
+ case Iop_Shl8x8:
+ case Iop_Shl16x4:
+ case Iop_Shl32x2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Shl8x8: size = 0; break;
+ case Iop_Shl16x4: size = 1; break;
+ case Iop_Shl32x2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_QShl8x8:
+ case Iop_QShl16x4:
+ case Iop_QShl32x2:
+ case Iop_QShl64x1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QShl8x8: size = 0; break;
+ case Iop_QShl16x4: size = 1; break;
+ case Iop_QShl32x2: size = 2; break;
+ case Iop_QShl64x1: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NShift(ARMneon_VQSHL,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_QSal8x8:
+ case Iop_QSal16x4:
+ case Iop_QSal32x2:
+ case Iop_QSal64x1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QSal8x8: size = 0; break;
+ case Iop_QSal16x4: size = 1; break;
+ case Iop_QSal32x2: size = 2; break;
+ case Iop_QSal64x1: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NShift(ARMneon_VQSAL,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_QShlN8x8:
+ case Iop_QShlN16x4:
+ case Iop_QShlN32x2:
+ case Iop_QShlN64x1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ UInt size, imm;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM taget supports Iop_QShlNAxB with constant "
+ "second argument only\n");
+ }
+ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
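+ /* Pack the lane width and shift amount into the single 'size'
+ operand (width-bit | imm); the instruction emitter
+ presumably unpacks this encoding. */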
+ switch (e->Iex.Binop.op) {
+ case Iop_QShlN8x8: size = 8 | imm; break;
+ case Iop_QShlN16x4: size = 16 | imm; break;
+ case Iop_QShlN32x2: size = 32 | imm; break;
+ case Iop_QShlN64x1: size = 64 | imm; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNUU,
+ res, argL, size, False));
+ return res;
+ }
+ case Iop_QShlN8Sx8:
+ case Iop_QShlN16Sx4:
+ case Iop_QShlN32Sx2:
+ case Iop_QShlN64Sx1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ UInt size, imm;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM taget supports Iop_QShlNAxB with constant "
+ "second argument only\n");
+ }
+ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ switch (e->Iex.Binop.op) {
+ case Iop_QShlN8Sx8: size = 8 | imm; break;
+ case Iop_QShlN16Sx4: size = 16 | imm; break;
+ case Iop_QShlN32Sx2: size = 32 | imm; break;
+ case Iop_QShlN64Sx1: size = 64 | imm; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNUS,
+ res, argL, size, False));
+ return res;
+ }
+ case Iop_QSalN8x8:
+ case Iop_QSalN16x4:
+ case Iop_QSalN32x2:
+ case Iop_QSalN64x1: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ UInt size, imm;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM taget supports Iop_QShlNAxB with constant "
+ "second argument only\n");
+ }
+ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ switch (e->Iex.Binop.op) {
+ case Iop_QSalN8x8: size = 8 | imm; break;
+ case Iop_QSalN16x4: size = 16 | imm; break;
+ case Iop_QSalN32x2: size = 32 | imm; break;
+ case Iop_QSalN64x1: size = 64 | imm; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNSS,
+ res, argL, size, False));
+ return res;
+ }
+ case Iop_ShrN8x8:
+ case Iop_ShrN16x4:
+ case Iop_ShrN32x2:
+ case Iop_Shr64: {
+ HReg res = newVRegD(env);
+ HReg tmp = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg argR2 = newVRegI(env);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_ShrN8x8: size = 0; break;
+ case Iop_ShrN16x4: size = 1; break;
+ case Iop_ShrN32x2: size = 2; break;
+ case Iop_Shr64: size = 3; break;
+ default: vassert(0);
+ }
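+ /* Negate the scalar count and duplicate it across all lanes,
+ so VSHL's negative counts produce the logical right shift. */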
+ addInstr(env, ARMInstr_Unary(ARMun_NEG, argR2, argR));
+ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, tmp, argR2, 0, False));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ res, argL, tmp, size, False));
+ return res;
+ }
+ case Iop_ShlN8x8:
+ case Iop_ShlN16x4:
+ case Iop_ShlN32x2:
+ case Iop_Shl64: {
+ HReg res = newVRegD(env);
+ HReg tmp = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_ShlN8x8: size = 0; break;
+ case Iop_ShlN16x4: size = 1; break;
+ case Iop_ShlN32x2: size = 2; break;
+ case Iop_Shl64: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, tmp, argR, 0, False));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ res, argL, tmp, size, False));
+ return res;
+ }
+ case Iop_SarN8x8:
+ case Iop_SarN16x4:
+ case Iop_SarN32x2:
+ case Iop_Sar64: {
+ HReg res = newVRegD(env);
+ HReg tmp = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg argR2 = newVRegI(env);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_SarN8x8: size = 0; break;
+ case Iop_SarN16x4: size = 1; break;
+ case Iop_SarN32x2: size = 2; break;
+ case Iop_Sar64: size = 3; break;
+ default: vassert(0);
+ }
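+ /* As for ShrN above, but with the signed shift so that the
+ right shift is arithmetic. */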
+ addInstr(env, ARMInstr_Unary(ARMun_NEG, argR2, argR));
+ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, tmp, argR2, 0, False));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSAL,
+ res, argL, tmp, size, False));
+ return res;
+ }
+ case Iop_CmpGT8Ux8:
+ case Iop_CmpGT16Ux4:
+ case Iop_CmpGT32Ux2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpGT8Ux8: size = 0; break;
+ case Iop_CmpGT16Ux4: size = 1; break;
+ case Iop_CmpGT32Ux2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTU,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_CmpGT8Sx8:
+ case Iop_CmpGT16Sx4:
+ case Iop_CmpGT32Sx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpGT8Sx8: size = 0; break;
+ case Iop_CmpGT16Sx4: size = 1; break;
+ case Iop_CmpGT32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTS,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_CmpEQ8x8:
+ case Iop_CmpEQ16x4:
+ case Iop_CmpEQ32x2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ8x8: size = 0; break;
+ case Iop_CmpEQ16x4: size = 1; break;
+ case Iop_CmpEQ32x2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCEQ,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Mul8x8:
+ case Iop_Mul16x4:
+ case Iop_Mul32x2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_Mul8x8: size = 0; break;
+ case Iop_Mul16x4: size = 1; break;
+ case Iop_Mul32x2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMUL,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Mul32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMULFP,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_QDMulHi16Sx4:
+ case Iop_QDMulHi32Sx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_QDMulHi16Sx4: size = 1; break;
+ case Iop_QDMulHi32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQDMULH,
+ res, argL, argR, size, False));
+ return res;
+ }
+
+ case Iop_QRDMulHi16Sx4:
+ case Iop_QRDMulHi32Sx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_QRDMulHi16Sx4: size = 1; break;
+ case Iop_QRDMulHi32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQRDMULH,
+ res, argL, argR, size, False));
+ return res;
+ }
+
+ case Iop_PwAdd8x8:
+ case Iop_PwAdd16x4:
+ case Iop_PwAdd32x2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_PwAdd8x8: size = 0; break;
+ case Iop_PwAdd16x4: size = 1; break;
+ case Iop_PwAdd32x2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPADD,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_PwAdd32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPADDFP,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_PwMin8Ux8:
+ case Iop_PwMin16Ux4:
+ case Iop_PwMin32Ux2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_PwMin8Ux8: size = 0; break;
+ case Iop_PwMin16Ux4: size = 1; break;
+ case Iop_PwMin32Ux2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPMINU,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_PwMin8Sx8:
+ case Iop_PwMin16Sx4:
+ case Iop_PwMin32Sx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_PwMin8Sx8: size = 0; break;
+ case Iop_PwMin16Sx4: size = 1; break;
+ case Iop_PwMin32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPMINS,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_PwMax8Ux8:
+ case Iop_PwMax16Ux4:
+ case Iop_PwMax32Ux2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_PwMax8Ux8: size = 0; break;
+ case Iop_PwMax16Ux4: size = 1; break;
+ case Iop_PwMax32Ux2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPMAXU,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_PwMax8Sx8:
+ case Iop_PwMax16Sx4:
+ case Iop_PwMax32Sx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_PwMax8Sx8: size = 0; break;
+ case Iop_PwMax16Sx4: size = 1; break;
+ case Iop_PwMax32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPMAXS,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Perm8x8: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VTBL,
+ res, argL, argR, 0, False));
+ return res;
+ }
+ case Iop_PolynomialMul8x8: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMULP,
+ res, argL, argR, size, False));
+ return res;
+ }
+ case Iop_Max32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXF,
+ res, argL, argR, 2, False));
+ return res;
+ }
+ case Iop_Min32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMINF,
+ res, argL, argR, 2, False));
+ return res;
+ }
+ case Iop_PwMax32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPMAXF,
+ res, argL, argR, 2, False));
+ return res;
+ }
+ case Iop_PwMin32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPMINF,
+ res, argL, argR, 2, False));
+ return res;
+ }
+ case Iop_CmpGT32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTF,
+ res, argL, argR, 2, False));
+ return res;
+ }
+ case Iop_CmpGE32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEF,
+ res, argL, argR, 2, False));
+ return res;
+ }
+ case Iop_CmpEQ32Fx2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCEQF,
+ res, argL, argR, 2, False));
+ return res;
+ }
+ case Iop_F32ToFixed32Ux2_RZ:
+ case Iop_F32ToFixed32Sx2_RZ:
+ case Iop_Fixed32UToF32x2_RN:
+ case Iop_Fixed32SToF32x2_RN: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ ARMNeonUnOp op;
+ UInt imm6;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM supports FP <-> Fixed conversion with constant "
+ "second argument less than 33 only\n");
+ }
+ imm6 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ vassert(imm6 <= 32 && imm6 > 0);
+ imm6 = 64 - imm6;
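+ /* This matches the NEON VCVT fixed-point immediate encoding,
+ in which imm6 = 64 - #fbits for 32-bit lanes. */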
+ switch(e->Iex.Binop.op) {
+ case Iop_F32ToFixed32Ux2_RZ: op = ARMneon_VCVTFtoFixedU; break;
+ case Iop_F32ToFixed32Sx2_RZ: op = ARMneon_VCVTFtoFixedS; break;
+ case Iop_Fixed32UToF32x2_RN: op = ARMneon_VCVTFixedUtoF; break;
+ case Iop_Fixed32SToF32x2_RN: op = ARMneon_VCVTFixedStoF; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(op, res, arg, imm6, False));
+ return res;
+ }
+ /*
+ FIXME: is this here or not?
+ case Iop_VDup8x8:
+ case Iop_VDup16x4:
+ case Iop_VDup32x2: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ UInt index;
+ UInt imm4;
+ UInt size = 0;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM supports Iop_VDup with constant "
+ "second argument less than 16 only\n");
+ }
+ index = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ switch(e->Iex.Binop.op) {
+ case Iop_VDup8x8: imm4 = (index << 1) + 1; break;
+ case Iop_VDup16x4: imm4 = (index << 2) + 2; break;
+ case Iop_VDup32x2: imm4 = (index << 3) + 4; break;
+ default: vassert(0);
+ }
+ if (imm4 >= 16) {
+ vpanic("ARM supports Iop_VDup with constant "
+ "second argument less than 16 only\n");
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_VDUP,
+ res, argL, imm4, False));
+ return res;
+ }
+ */
+ default:
+ break;
+ }
+ }
+
+ /* --------- UNARY ops --------- */
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+
+ /* ReinterpF64asI64 */
+ case Iop_ReinterpF64asI64:
+ /* Left64(e) */
+ case Iop_Left64:
+ /* CmpwNEZ64(e) */
+ //case Iop_CmpwNEZ64:
+ case Iop_1Sto64: {
+ HReg rLo, rHi;
+ HReg res = newVRegD(env);
+ iselInt64Expr(&rHi, &rLo, env, e);
+ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo));
+ return res;
+ }
+ case Iop_Not64: {
+ DECLARE_PATTERN(p_veqz_8x8);
+ DECLARE_PATTERN(p_veqz_16x4);
+ DECLARE_PATTERN(p_veqz_32x2);
+ DECLARE_PATTERN(p_vcge_8sx8);
+ DECLARE_PATTERN(p_vcge_16sx4);
+ DECLARE_PATTERN(p_vcge_32sx2);
+ DECLARE_PATTERN(p_vcge_8ux8);
+ DECLARE_PATTERN(p_vcge_16ux4);
+ DECLARE_PATTERN(p_vcge_32ux2);
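+ /* Try to fold Not64 over a comparison into one insn:
+ Not(CmpNEZx) becomes a compare-equal-with-zero, and
+ Not(CmpGT(y,x)) becomes VCGE(x,y); failing that, fall
+ through to a plain bitwise NOT. */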
+ DEFINE_PATTERN(p_veqz_8x8,
+ unop(Iop_Not64, unop(Iop_CmpNEZ8x8, bind(0))));
+ DEFINE_PATTERN(p_veqz_16x4,
+ unop(Iop_Not64, unop(Iop_CmpNEZ16x4, bind(0))));
+ DEFINE_PATTERN(p_veqz_32x2,
+ unop(Iop_Not64, unop(Iop_CmpNEZ32x2, bind(0))));
+ DEFINE_PATTERN(p_vcge_8sx8,
+ unop(Iop_Not64, binop(Iop_CmpGT8Sx8, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_16sx4,
+ unop(Iop_Not64, binop(Iop_CmpGT16Sx4, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_32sx2,
+ unop(Iop_Not64, binop(Iop_CmpGT32Sx2, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_8ux8,
+ unop(Iop_Not64, binop(Iop_CmpGT8Ux8, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_16ux4,
+ unop(Iop_Not64, binop(Iop_CmpGT16Ux4, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_32ux2,
+ unop(Iop_Not64, binop(Iop_CmpGT32Ux2, bind(1), bind(0))));
+ if (matchIRExpr(&mi, p_veqz_8x8, e)) {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 0, False));
+ return res;
+ } else if (matchIRExpr(&mi, p_veqz_16x4, e)) {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 1, False));
+ return res;
+ } else if (matchIRExpr(&mi, p_veqz_32x2, e)) {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 2, False));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_8sx8, e)) {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, mi.bindee[0]);
+ HReg argR = iselNeon64Expr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES,
+ res, argL, argR, 0, False));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_16sx4, e)) {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, mi.bindee[0]);
+ HReg argR = iselNeon64Expr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES,
+ res, argL, argR, 1, False));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_32sx2, e)) {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, mi.bindee[0]);
+ HReg argR = iselNeon64Expr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES,
+ res, argL, argR, 2, False));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_8ux8, e)) {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, mi.bindee[0]);
+ HReg argR = iselNeon64Expr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU,
+ res, argL, argR, 0, False));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_16ux4, e)) {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, mi.bindee[0]);
+ HReg argR = iselNeon64Expr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU,
+ res, argL, argR, 1, False));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_32ux2, e)) {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, mi.bindee[0]);
+ HReg argR = iselNeon64Expr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU,
+ res, argL, argR, 2, False));
+ return res;
+ } else {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, res, arg, 4, False));
+ return res;
+ }
+ }
+ case Iop_Dup8x8:
+ case Iop_Dup16x4:
+ case Iop_Dup32x2: {
+ HReg res, arg;
+ UInt size;
+ DECLARE_PATTERN(p_vdup_8x8);
+ DECLARE_PATTERN(p_vdup_16x4);
+ DECLARE_PATTERN(p_vdup_32x2);
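+ /* Dup of a GetElem with a constant lane index maps onto the
+ scalar form of VDUP; the imm4 values computed below follow
+ the VDUP scalar encoding for each element size. */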
+ DEFINE_PATTERN(p_vdup_8x8,
+ unop(Iop_Dup8x8, binop(Iop_GetElem8x8, bind(0), bind(1))));
+ DEFINE_PATTERN(p_vdup_16x4,
+ unop(Iop_Dup16x4, binop(Iop_GetElem16x4, bind(0), bind(1))));
+ DEFINE_PATTERN(p_vdup_32x2,
+ unop(Iop_Dup32x2, binop(Iop_GetElem32x2, bind(0), bind(1))));
+ if (matchIRExpr(&mi, p_vdup_8x8, e)) {
+ UInt index;
+ UInt imm4;
+ if (mi.bindee[1]->tag == Iex_Const &&
+ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) {
+ index = mi.bindee[1]->Iex.Const.con->Ico.U8;
+ imm4 = (index << 1) + 1;
+ if (index < 8) {
+ res = newVRegD(env);
+ arg = iselNeon64Expr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnaryS(
+ ARMneon_VDUP,
+ mkARMNRS(ARMNRS_Reg, res, 0),
+ mkARMNRS(ARMNRS_Scalar, arg, index),
+ imm4, False
+ ));
+ return res;
+ }
+ }
+ } else if (matchIRExpr(&mi, p_vdup_16x4, e)) {
+ UInt index;
+ UInt imm4;
+ if (mi.bindee[1]->tag == Iex_Const &&
+ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) {
+ index = mi.bindee[1]->Iex.Const.con->Ico.U8;
+ imm4 = (index << 2) + 2;
+ if (index < 4) {
+ res = newVRegD(env);
+ arg = iselNeon64Expr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnaryS(
+ ARMneon_VDUP,
+ mkARMNRS(ARMNRS_Reg, res, 0),
+ mkARMNRS(ARMNRS_Scalar, arg, index),
+ imm4, False
+ ));
+ return res;
+ }
+ }
+ } else if (matchIRExpr(&mi, p_vdup_32x2, e)) {
+ UInt index;
+ UInt imm4;
+ if (mi.bindee[1]->tag == Iex_Const &&
+ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) {
+ index = mi.bindee[1]->Iex.Const.con->Ico.U8;
+ imm4 = (index << 3) + 4;
+ if (index < 2) {
+ res = newVRegD(env);
+ arg = iselNeon64Expr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnaryS(
+ ARMneon_VDUP,
+ mkARMNRS(ARMNRS_Reg, res, 0),
+ mkARMNRS(ARMNRS_Scalar, arg, index),
+ imm4, False
+ ));
+ return res;
+ }
+ }
+ }
+ arg = iselIntExpr_R(env, e->Iex.Unop.arg);
+ res = newVRegD(env);
+ switch (e->Iex.Unop.op) {
+ case Iop_Dup8x8: size = 0; break;
+ case Iop_Dup16x4: size = 1; break;
+ case Iop_Dup32x2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, res, arg, size, False));
+ return res;
+ }
+ case Iop_Abs8x8:
+ case Iop_Abs16x4:
+ case Iop_Abs32x2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_Abs8x8: size = 0; break;
+ case Iop_Abs16x4: size = 1; break;
+ case Iop_Abs32x2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_ABS, res, arg, size, False));
+ return res;
+ }
+ case Iop_Reverse64_8x8:
+ case Iop_Reverse64_16x4:
+ case Iop_Reverse64_32x2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_Reverse64_8x8: size = 0; break;
+ case Iop_Reverse64_16x4: size = 1; break;
+ case Iop_Reverse64_32x2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_REV64,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_Reverse32_8x8:
+ case Iop_Reverse32_16x4: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_Reverse32_8x8: size = 0; break;
+ case Iop_Reverse32_16x4: size = 1; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_REV32,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_Reverse16_8x8: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NUnary(ARMneon_REV16,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_CmpwNEZ64: {
+ HReg x_lsh = newVRegD(env);
+ HReg x_rsh = newVRegD(env);
+ HReg lsh_amt = newVRegD(env);
+ HReg rsh_amt = newVRegD(env);
+ HReg zero = newVRegD(env);
+ HReg tmp = newVRegD(env);
+ HReg tmp2 = newVRegD(env);
+ HReg res = newVRegD(env);
+ HReg x = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
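+ /* Make a per-32-bit-lane nonzero mask, then OR it with copies
+ of itself shifted 32 bits up and down within each 64-bit
+ lane, so one nonzero half forces the whole lane to
+ all-ones. */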
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, tmp2, arg, 2, False));
+ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, x, tmp2, 4, False));
+ addInstr(env, ARMInstr_NeonImm(lsh_amt, ARMNImm_TI(0, 32)));
+ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0, 0)));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB,
+ rsh_amt, zero, lsh_amt, 2, False));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ x_lsh, x, lsh_amt, 3, False));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ x_rsh, x, rsh_amt, 3, False));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VORR,
+ tmp, x_lsh, x_rsh, 0, False));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VORR,
+ res, tmp, x, 0, False));
+ return res;
+ }
+ case Iop_CmpNEZ8x8:
+ case Iop_CmpNEZ16x4:
+ case Iop_CmpNEZ32x2: {
+ HReg res = newVRegD(env);
+ HReg tmp = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size;
+ switch (e->Iex.Unop.op) {
+ case Iop_CmpNEZ8x8: size = 0; break;
+ case Iop_CmpNEZ16x4: size = 1; break;
+ case Iop_CmpNEZ32x2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, tmp, arg, size, False));
+ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, res, tmp, 4, False));
+ return res;
+ }
+ case Iop_Shorten16x8:
+ case Iop_Shorten32x4:
+ case Iop_Shorten64x2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_Shorten16x8: size = 0; break;
+ case Iop_Shorten32x4: size = 1; break;
+ case Iop_Shorten64x2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPYN,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_QShortenS16Sx8:
+ case Iop_QShortenS32Sx4:
+ case Iop_QShortenS64Sx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_QShortenS16Sx8: size = 0; break;
+ case Iop_QShortenS32Sx4: size = 1; break;
+ case Iop_QShortenS64Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPYQNSS,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_QShortenU16Sx8:
+ case Iop_QShortenU32Sx4:
+ case Iop_QShortenU64Sx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_QShortenU16Sx8: size = 0; break;
+ case Iop_QShortenU32Sx4: size = 1; break;
+ case Iop_QShortenU64Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPYQNUS,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_QShortenU16Ux8:
+ case Iop_QShortenU32Ux4:
+ case Iop_QShortenU64Ux2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_QShortenU16Ux8: size = 0; break;
+ case Iop_QShortenU32Ux4: size = 1; break;
+ case Iop_QShortenU64Ux2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPYQNUU,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_PwAddL8Sx8:
+ case Iop_PwAddL16Sx4:
+ case Iop_PwAddL32Sx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_PwAddL8Sx8: size = 0; break;
+ case Iop_PwAddL16Sx4: size = 1; break;
+ case Iop_PwAddL32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_PADDLS,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_PwAddL8Ux8:
+ case Iop_PwAddL16Ux4:
+ case Iop_PwAddL32Ux2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_PwAddL8Ux8: size = 0; break;
+ case Iop_PwAddL16Ux4: size = 1; break;
+ case Iop_PwAddL32Ux2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_PADDLU,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_Cnt8x8: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NUnary(ARMneon_CNT,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_Clz8Sx8:
+ case Iop_Clz16Sx4:
+ case Iop_Clz32Sx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_Clz8Sx8: size = 0; break;
+ case Iop_Clz16Sx4: size = 1; break;
+ case Iop_Clz32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_CLZ,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_Cls8Sx8:
+ case Iop_Cls16Sx4:
+ case Iop_Cls32Sx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ switch (e->Iex.Unop.op) {
+ case Iop_Cls8Sx8: size = 0; break;
+ case Iop_Cls16Sx4: size = 1; break;
+ case Iop_Cls32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_CLS,
+ res, arg, size, False));
+ return res;
+ }
+ case Iop_FtoI32Sx2_RZ: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTFtoS,
+ res, arg, 2, False));
+ return res;
+ }
+ case Iop_FtoI32Ux2_RZ: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTFtoU,
+ res, arg, 2, False));
+ return res;
+ }
+ case Iop_I32StoFx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTStoF,
+ res, arg, 2, False));
+ return res;
+ }
+ case Iop_I32UtoFx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTUtoF,
+ res, arg, 2, False));
+ return res;
+ }
+ case Iop_F32toF16x4: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTF32toF16,
+ res, arg, 2, False));
+ return res;
+ }
+ case Iop_Recip32Fx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VRECIPF,
+ res, arg, 0, False));
+ return res;
+ }
+ case Iop_Recip32x2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VRECIP,
+ res, arg, 0, False));
+ return res;
+ }
+ case Iop_Abs32Fx2: {
+ DECLARE_PATTERN(p_vabd_32fx2);
+ DEFINE_PATTERN(p_vabd_32fx2,
+ unop(Iop_Abs32Fx2,
+ binop(Iop_Sub32Fx2,
+ bind(0),
+ bind(1))));
+ if (matchIRExpr(&mi, p_vabd_32fx2, e)) {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, mi.bindee[0]);
+ HReg argR = iselNeon64Expr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VABDFP,
+ res, argL, argR, 0, False));
+ return res;
+ } else {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VABSFP,
+ res, arg, 0, False));
+ return res;
+ }
+ }
+ case Iop_Rsqrte32Fx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VRSQRTEFP,
+ res, arg, 0, False));
+ return res;
+ }
+ case Iop_Rsqrte32x2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VRSQRTE,
+ res, arg, 0, False));
+ return res;
+ }
+ case Iop_Neg32Fx2: {
+ HReg res = newVRegD(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VNEGF,
+ res, arg, 0, False));
+ return res;
+ }
+ default:
+ break;
+ }
+ } /* if (e->tag == Iex_Unop) */
+
+ if (e->tag == Iex_Triop) {
+ switch (e->Iex.Triop.op) {
+ case Iop_Extract64: {
+ HReg res = newVRegD(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Triop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Triop.arg2);
+ UInt imm4;
+ if (e->Iex.Triop.arg3->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Triop.arg3) != Ity_I8) {
+ vpanic("ARM target supports Iop_Extract64 with constant "
+ "third argument less than 16 only\n");
+ }
+ imm4 = e->Iex.Triop.arg3->Iex.Const.con->Ico.U8;
+ if (imm4 >= 8) {
+ vpanic("ARM target supports Iop_Extract64 with constant "
+ "third argument less than 16 only\n");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VEXT,
+ res, argL, argR, imm4, False));
+ return res;
+ }
+ case Iop_SetElem8x8:
+ case Iop_SetElem16x4:
+ case Iop_SetElem32x2: {
+ HReg res = newVRegD(env);
+ HReg dreg = iselNeon64Expr(env, e->Iex.Triop.arg1);
+ HReg arg = iselIntExpr_R(env, e->Iex.Triop.arg3);
+ UInt index, size;
+ if (e->Iex.Triop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Triop.arg2) != Ity_I8) {
+ vpanic("ARM target supports SetElem with constant "
+ "second argument only\n");
+ }
+ index = e->Iex.Triop.arg2->Iex.Const.con->Ico.U8;
+ switch (e->Iex.Triop.op) {
+ case Iop_SetElem8x8: vassert(index < 8); size = 0; break;
+ case Iop_SetElem16x4: vassert(index < 4); size = 1; break;
+ case Iop_SetElem32x2: vassert(index < 2); size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, res, dreg, 4, False));
+ addInstr(env, ARMInstr_NUnaryS(ARMneon_SETELEM,
+ mkARMNRS(ARMNRS_Scalar, res, index),
+ mkARMNRS(ARMNRS_Reg, arg, 0),
+ size, False));
+ return res;
+ }
+ default:
+ break;
+ }
+ }
+
+ /* --------- MULTIPLEX --------- */
+ if (e->tag == Iex_Mux0X) {
+ HReg rLo, rHi;
+ HReg res = newVRegD(env);
+ iselInt64Expr(&rHi, &rLo, env, e);
+ addInstr(env, ARMInstr_VXferD(True/*toD*/, res, rHi, rLo));
+ return res;
+ }
+
+ ppIRExpr(e);
+ vpanic("iselNeon64Expr");
+}
+
+static HReg iselNeonExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselNeonExpr_wrk( env, e );
+ vassert(hregClass(r) == HRcVec128);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselNeonExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env, e);
+ MatchInfo mi;
+ vassert(e);
+ vassert(ty == Ity_V128);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Const) {
+      /* At the moment there should be no 128-bit constants in IR
+         generated for ARM during disassembly; they are represented as
+         Iop_64HLtoV128 binary operations and handled among the binary
+         ops.  However, a zero constant can be created by Valgrind's
+         internal optimiser. */
+ if (e->Iex.Const.con->Ico.V128 == 0) {
+ HReg res = newVRegV(env);
+ addInstr(env, ARMInstr_NeonImm(res, ARMNImm_TI(0, 0)));
+ return res;
+ }
+ ppIRExpr(e);
+ vpanic("128-bit constant is not implemented");
+ }
+
+ if (e->tag == Iex_Load) {
+ HReg res = newVRegV(env);
+ ARMAModeN* am = iselIntExpr_AModeN(env, e->Iex.Load.addr);
+ vassert(ty == Ity_V128);
+ addInstr(env, ARMInstr_NLdStQ(True, res, am));
+ return res;
+ }
+
+ if (e->tag == Iex_Get) {
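+      /* Q-register loads have no reg+offset addressing mode, so first
+         compute the guest state address (R8 + offset) into a temporary
+         register. */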
+ HReg addr = newVRegI(env);
+ HReg res = newVRegV(env);
+ vassert(ty == Ity_V128);
+ addInstr(env, ARMInstr_Add32(addr, hregARM_R8(), e->Iex.Get.offset));
+ addInstr(env, ARMInstr_NLdStQ(True, res, mkARMAModeN_R(addr)));
+ return res;
+ }
+
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+ case Iop_NotV128: {
+ DECLARE_PATTERN(p_veqz_8x16);
+ DECLARE_PATTERN(p_veqz_16x8);
+ DECLARE_PATTERN(p_veqz_32x4);
+ DECLARE_PATTERN(p_vcge_8sx16);
+ DECLARE_PATTERN(p_vcge_16sx8);
+ DECLARE_PATTERN(p_vcge_32sx4);
+ DECLARE_PATTERN(p_vcge_8ux16);
+ DECLARE_PATTERN(p_vcge_16ux8);
+ DECLARE_PATTERN(p_vcge_32ux4);
+ DEFINE_PATTERN(p_veqz_8x16,
+ unop(Iop_NotV128, unop(Iop_CmpNEZ8x16, bind(0))));
+ DEFINE_PATTERN(p_veqz_16x8,
+ unop(Iop_NotV128, unop(Iop_CmpNEZ16x8, bind(0))));
+ DEFINE_PATTERN(p_veqz_32x4,
+ unop(Iop_NotV128, unop(Iop_CmpNEZ32x4, bind(0))));
+ DEFINE_PATTERN(p_vcge_8sx16,
+ unop(Iop_NotV128, binop(Iop_CmpGT8Sx16, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_16sx8,
+ unop(Iop_NotV128, binop(Iop_CmpGT16Sx8, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_32sx4,
+ unop(Iop_NotV128, binop(Iop_CmpGT32Sx4, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_8ux16,
+ unop(Iop_NotV128, binop(Iop_CmpGT8Ux16, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_16ux8,
+ unop(Iop_NotV128, binop(Iop_CmpGT16Ux8, bind(1), bind(0))));
+ DEFINE_PATTERN(p_vcge_32ux4,
+ unop(Iop_NotV128, binop(Iop_CmpGT32Ux4, bind(1), bind(0))));
+ if (matchIRExpr(&mi, p_veqz_8x16, e)) {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 0, True));
+ return res;
+ } else if (matchIRExpr(&mi, p_veqz_16x8, e)) {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 1, True));
+ return res;
+ } else if (matchIRExpr(&mi, p_veqz_32x4, e)) {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, res, arg, 2, True));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_8sx16, e)) {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, mi.bindee[0]);
+ HReg argR = iselNeonExpr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES,
+ res, argL, argR, 0, True));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_16sx8, e)) {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, mi.bindee[0]);
+ HReg argR = iselNeonExpr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES,
+ res, argL, argR, 1, True));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_32sx4, e)) {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, mi.bindee[0]);
+ HReg argR = iselNeonExpr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGES,
+ res, argL, argR, 2, True));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_8ux16, e)) {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, mi.bindee[0]);
+ HReg argR = iselNeonExpr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU,
+ res, argL, argR, 0, True));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_16ux8, e)) {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, mi.bindee[0]);
+ HReg argR = iselNeonExpr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU,
+ res, argL, argR, 1, True));
+ return res;
+ } else if (matchIRExpr(&mi, p_vcge_32ux4, e)) {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, mi.bindee[0]);
+ HReg argR = iselNeonExpr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEU,
+ res, argL, argR, 2, True));
+ return res;
+ } else {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, res, arg, 4, True));
+ return res;
+ }
+ }
+ case Iop_Dup8x16:
+ case Iop_Dup16x8:
+ case Iop_Dup32x4: {
+ HReg res, arg;
+ UInt size;
+ DECLARE_PATTERN(p_vdup_8x16);
+ DECLARE_PATTERN(p_vdup_16x8);
+ DECLARE_PATTERN(p_vdup_32x4);
+ DEFINE_PATTERN(p_vdup_8x16,
+ unop(Iop_Dup8x16, binop(Iop_GetElem8x8, bind(0), bind(1))));
+ DEFINE_PATTERN(p_vdup_16x8,
+ unop(Iop_Dup16x8, binop(Iop_GetElem16x4, bind(0), bind(1))));
+ DEFINE_PATTERN(p_vdup_32x4,
+ unop(Iop_Dup32x4, binop(Iop_GetElem32x2, bind(0), bind(1))));
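+         /* A Dup of a GetElem can be done as a single VDUP (scalar).
+            Its 4-bit immediate packs lane size and index together: the
+            position of the lowest set bit gives the element size
+            (xxx1 = 8-bit, xx10 = 16-bit, x100 = 32-bit) and the bits
+            above it give the lane index, hence the shift-and-tag
+            arithmetic below. */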
+ if (matchIRExpr(&mi, p_vdup_8x16, e)) {
+ UInt index;
+ UInt imm4;
+ if (mi.bindee[1]->tag == Iex_Const &&
+ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) {
+ index = mi.bindee[1]->Iex.Const.con->Ico.U8;
+ imm4 = (index << 1) + 1;
+ if (index < 8) {
+ res = newVRegV(env);
+ arg = iselNeon64Expr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnaryS(
+ ARMneon_VDUP,
+ mkARMNRS(ARMNRS_Reg, res, 0),
+ mkARMNRS(ARMNRS_Scalar, arg, index),
+ imm4, True
+ ));
+ return res;
+ }
+ }
+ } else if (matchIRExpr(&mi, p_vdup_16x8, e)) {
+ UInt index;
+ UInt imm4;
+ if (mi.bindee[1]->tag == Iex_Const &&
+ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) {
+ index = mi.bindee[1]->Iex.Const.con->Ico.U8;
+ imm4 = (index << 2) + 2;
+ if (index < 4) {
+ res = newVRegV(env);
+ arg = iselNeon64Expr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnaryS(
+ ARMneon_VDUP,
+ mkARMNRS(ARMNRS_Reg, res, 0),
+ mkARMNRS(ARMNRS_Scalar, arg, index),
+ imm4, True
+ ));
+ return res;
+ }
+ }
+ } else if (matchIRExpr(&mi, p_vdup_32x4, e)) {
+ UInt index;
+ UInt imm4;
+ if (mi.bindee[1]->tag == Iex_Const &&
+ typeOfIRExpr(env->type_env, mi.bindee[1]) == Ity_I8) {
+ index = mi.bindee[1]->Iex.Const.con->Ico.U8;
+ imm4 = (index << 3) + 4;
+ if (index < 2) {
+ res = newVRegV(env);
+ arg = iselNeon64Expr(env, mi.bindee[0]);
+ addInstr(env, ARMInstr_NUnaryS(
+ ARMneon_VDUP,
+ mkARMNRS(ARMNRS_Reg, res, 0),
+ mkARMNRS(ARMNRS_Scalar, arg, index),
+ imm4, True
+ ));
+ return res;
+ }
+ }
+ }
+ arg = iselIntExpr_R(env, e->Iex.Unop.arg);
+ res = newVRegV(env);
+ switch (e->Iex.Unop.op) {
+ case Iop_Dup8x16: size = 0; break;
+ case Iop_Dup16x8: size = 1; break;
+ case Iop_Dup32x4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, res, arg, size, True));
+ return res;
+ }
+ case Iop_Abs8x16:
+ case Iop_Abs16x8:
+ case Iop_Abs32x4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+         switch (e->Iex.Unop.op) {
+ case Iop_Abs8x16: size = 0; break;
+ case Iop_Abs16x8: size = 1; break;
+ case Iop_Abs32x4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_ABS, res, arg, size, True));
+ return res;
+ }
+ case Iop_Reverse64_8x16:
+ case Iop_Reverse64_16x8:
+ case Iop_Reverse64_32x4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+         switch (e->Iex.Unop.op) {
+ case Iop_Reverse64_8x16: size = 0; break;
+ case Iop_Reverse64_16x8: size = 1; break;
+ case Iop_Reverse64_32x4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_REV64,
+ res, arg, size, True));
+ return res;
+ }
+ case Iop_Reverse32_8x16:
+ case Iop_Reverse32_16x8: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+         switch (e->Iex.Unop.op) {
+ case Iop_Reverse32_8x16: size = 0; break;
+ case Iop_Reverse32_16x8: size = 1; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_REV32,
+ res, arg, size, True));
+ return res;
+ }
+ case Iop_Reverse16_8x16: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NUnary(ARMneon_REV16,
+ res, arg, size, True));
+ return res;
+ }
+ case Iop_CmpNEZ64x2: {
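+         /* NEON has no 64x2 compare against zero, so synthesise one:
+            do a 32x4 compare (EQZ then NOT gives CmpNEZ per 32-bit
+            half), then OR each 64-bit lane with copies of itself
+            shifted up and down by 32 bits, so that a nonzero result in
+            either half smears across the whole lane. */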
+ HReg x_lsh = newVRegV(env);
+ HReg x_rsh = newVRegV(env);
+ HReg lsh_amt = newVRegV(env);
+ HReg rsh_amt = newVRegV(env);
+ HReg zero = newVRegV(env);
+ HReg tmp = newVRegV(env);
+ HReg tmp2 = newVRegV(env);
+ HReg res = newVRegV(env);
+ HReg x = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, tmp2, arg, 2, True));
+ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, x, tmp2, 4, True));
+ addInstr(env, ARMInstr_NeonImm(lsh_amt, ARMNImm_TI(0, 32)));
+ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0, 0)));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB,
+ rsh_amt, zero, lsh_amt, 2, True));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ x_lsh, x, lsh_amt, 3, True));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ x_rsh, x, rsh_amt, 3, True));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VORR,
+ tmp, x_lsh, x_rsh, 0, True));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VORR,
+ res, tmp, x, 0, True));
+ return res;
+ }
+ case Iop_CmpNEZ8x16:
+ case Iop_CmpNEZ16x8:
+ case Iop_CmpNEZ32x4: {
+ HReg res = newVRegV(env);
+ HReg tmp = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size;
+ switch (e->Iex.Unop.op) {
+ case Iop_CmpNEZ8x16: size = 0; break;
+ case Iop_CmpNEZ16x8: size = 1; break;
+ case Iop_CmpNEZ32x4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_EQZ, tmp, arg, size, True));
+ addInstr(env, ARMInstr_NUnary(ARMneon_NOT, res, tmp, 4, True));
+ return res;
+ }
+ case Iop_Longen8Ux8:
+ case Iop_Longen16Ux4:
+ case Iop_Longen32Ux2: {
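+         /* Widening move (VMOVL.U<size>): each unsigned element of
+            the 64-bit source is zero-extended to twice its width in
+            the 128-bit result.  The signed variant below
+            sign-extends. */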
+ HReg res = newVRegV(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size;
+ switch (e->Iex.Unop.op) {
+ case Iop_Longen8Ux8: size = 0; break;
+ case Iop_Longen16Ux4: size = 1; break;
+ case Iop_Longen32Ux2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPYLU,
+ res, arg, size, True));
+ return res;
+ }
+ case Iop_Longen8Sx8:
+ case Iop_Longen16Sx4:
+ case Iop_Longen32Sx2: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ UInt size;
+ switch (e->Iex.Unop.op) {
+ case Iop_Longen8Sx8: size = 0; break;
+ case Iop_Longen16Sx4: size = 1; break;
+ case Iop_Longen32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPYLS,
+ res, arg, size, True));
+ return res;
+ }
+ case Iop_PwAddL8Sx16:
+ case Iop_PwAddL16Sx8:
+ case Iop_PwAddL32Sx4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+         switch (e->Iex.Unop.op) {
+ case Iop_PwAddL8Sx16: size = 0; break;
+ case Iop_PwAddL16Sx8: size = 1; break;
+ case Iop_PwAddL32Sx4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_PADDLS,
+ res, arg, size, True));
+ return res;
+ }
+ case Iop_PwAddL8Ux16:
+ case Iop_PwAddL16Ux8:
+ case Iop_PwAddL32Ux4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+         switch (e->Iex.Unop.op) {
+ case Iop_PwAddL8Ux16: size = 0; break;
+ case Iop_PwAddL16Ux8: size = 1; break;
+ case Iop_PwAddL32Ux4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_PADDLU,
+ res, arg, size, True));
+ return res;
+ }
+ case Iop_Cnt8x16: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NUnary(ARMneon_CNT, res, arg, size, True));
+ return res;
+ }
+ case Iop_Clz8Sx16:
+ case Iop_Clz16Sx8:
+ case Iop_Clz32Sx4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+         switch (e->Iex.Unop.op) {
+ case Iop_Clz8Sx16: size = 0; break;
+ case Iop_Clz16Sx8: size = 1; break;
+ case Iop_Clz32Sx4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_CLZ, res, arg, size, True));
+ return res;
+ }
+ case Iop_Cls8Sx16:
+ case Iop_Cls16Sx8:
+ case Iop_Cls32Sx4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ UInt size = 0;
+         switch (e->Iex.Unop.op) {
+ case Iop_Cls8Sx16: size = 0; break;
+ case Iop_Cls16Sx8: size = 1; break;
+ case Iop_Cls32Sx4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_CLS, res, arg, size, True));
+ return res;
+ }
+ case Iop_FtoI32Sx4_RZ: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTFtoS,
+ res, arg, 2, True));
+ return res;
+ }
+ case Iop_FtoI32Ux4_RZ: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTFtoU,
+ res, arg, 2, True));
+ return res;
+ }
+ case Iop_I32StoFx4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTStoF,
+ res, arg, 2, True));
+ return res;
+ }
+ case Iop_I32UtoFx4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTUtoF,
+ res, arg, 2, True));
+ return res;
+ }
+ case Iop_F16toF32x4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeon64Expr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VCVTF16toF32,
+ res, arg, 2, True));
+ return res;
+ }
+ case Iop_Recip32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VRECIPF,
+ res, argL, 0, True));
+ return res;
+ }
+ case Iop_Recip32x4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VRECIP,
+ res, argL, 0, True));
+ return res;
+ }
+ case Iop_Abs32Fx4: {
+ DECLARE_PATTERN(p_vabd_32fx4);
+ DEFINE_PATTERN(p_vabd_32fx4,
+ unop(Iop_Abs32Fx4,
+ binop(Iop_Sub32Fx4,
+ bind(0),
+ bind(1))));
+ if (matchIRExpr(&mi, p_vabd_32fx4, e)) {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, mi.bindee[0]);
+ HReg argR = iselNeonExpr(env, mi.bindee[1]);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VABDFP,
+ res, argL, argR, 0, True));
+ return res;
+ } else {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VABSFP,
+ res, argL, 0, True));
+ return res;
+ }
+ }
+ case Iop_Rsqrte32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VRSQRTEFP,
+ res, argL, 0, True));
+ return res;
+ }
+ case Iop_Rsqrte32x4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VRSQRTE,
+ res, argL, 0, True));
+ return res;
+ }
+ case Iop_Neg32Fx4: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_NUnary(ARMneon_VNEGF,
+ res, arg, 0, True));
+ return res;
+ }
+ /* ... */
+ default:
+ break;
+ }
+ }
+
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+ case Iop_64HLtoV128:
+         /* Try to match into a single "VMOV reg, imm" instruction. */
+ if (e->Iex.Binop.arg1->tag == Iex_Const &&
+ e->Iex.Binop.arg2->tag == Iex_Const &&
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg1) == Ity_I64 &&
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) == Ity_I64 &&
+ e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 ==
+ e->Iex.Binop.arg2->Iex.Const.con->Ico.U64) {
+ ULong imm64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
+ ARMNImm *imm = Imm64_to_ARMNImm(imm64);
+ if (imm) {
+ HReg res = newVRegV(env);
+ addInstr(env, ARMInstr_NeonImm(res, imm));
+ return res;
+ }
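+            /* The full 64-bit value is not directly representable.
+               If only one 32-bit half is nonzero, try the value with
+               that half replicated into both halves; if that is
+               encodable, materialise it and clear the unwanted half by
+               ANDing with a per-byte 0x00/0xFF mask immediate. */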
+ if ((imm64 >> 32) == 0LL &&
+ (imm = Imm64_to_ARMNImm(imm64 | (imm64 << 32))) != NULL) {
+ HReg tmp1 = newVRegV(env);
+ HReg tmp2 = newVRegV(env);
+ HReg res = newVRegV(env);
+ if (imm->type < 10) {
+ addInstr(env, ARMInstr_NeonImm(tmp1, ARMNImm_TI(9,0x0f)));
+ addInstr(env, ARMInstr_NeonImm(tmp2, imm));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VAND,
+ res, tmp1, tmp2, 4, True));
+ return res;
+ }
+ }
+ if ((imm64 & 0xFFFFFFFFLL) == 0LL &&
+ (imm = Imm64_to_ARMNImm(imm64 | (imm64 >> 32))) != NULL) {
+ HReg tmp1 = newVRegV(env);
+ HReg tmp2 = newVRegV(env);
+ HReg res = newVRegV(env);
+ if (imm->type < 10) {
+ addInstr(env, ARMInstr_NeonImm(tmp1, ARMNImm_TI(9,0xf0)));
+ addInstr(env, ARMInstr_NeonImm(tmp2, imm));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VAND,
+ res, tmp1, tmp2, 4, True));
+ return res;
+ }
+ }
+ }
+ /* Does not match "VMOV Reg, Imm" form */
+ goto neon_expr_bad;
+ case Iop_AndV128: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VAND,
+ res, argL, argR, 4, True));
+ return res;
+ }
+ case Iop_OrV128: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VORR,
+ res, argL, argR, 4, True));
+ return res;
+ }
+ case Iop_XorV128: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VXOR,
+ res, argL, argR, 4, True));
+ return res;
+ }
+ case Iop_Add8x16:
+ case Iop_Add16x8:
+ case Iop_Add32x4:
+ case Iop_Add64x2: {
+ /*
+ FIXME: remove this if not used
+ DECLARE_PATTERN(p_vrhadd_32sx4);
+ ULong one = (1LL << 32) | 1LL;
+ DEFINE_PATTERN(p_vrhadd_32sx4,
+ binop(Iop_Add32x4,
+ binop(Iop_Add32x4,
+ binop(Iop_SarN32x4,
+ bind(0),
+ mkU8(1)),
+ binop(Iop_SarN32x4,
+ bind(1),
+ mkU8(1))),
+ binop(Iop_SarN32x4,
+ binop(Iop_Add32x4,
+ binop(Iop_Add32x4,
+ binop(Iop_AndV128,
+ bind(0),
+ mkU128(one)),
+ binop(Iop_AndV128,
+ bind(1),
+ mkU128(one))),
+ mkU128(one)),
+ mkU8(1))));
+ */
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Add8x16: size = 0; break;
+ case Iop_Add16x8: size = 1; break;
+ case Iop_Add32x4: size = 2; break;
+ case Iop_Add64x2: size = 3; break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vpanic("Illegal element size in VADD");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VADD,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Add32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VADDFP,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Recps32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VRECPS,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Rsqrts32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VRSQRTS,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_InterleaveEvenLanes8x16:
+ case Iop_InterleaveEvenLanes16x8:
+ case Iop_InterleaveEvenLanes32x4:
+ case Iop_InterleaveOddLanes8x16:
+ case Iop_InterleaveOddLanes16x8:
+ case Iop_InterleaveOddLanes32x4: {
+ HReg tmp = newVRegV(env);
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ UInt is_lo;
+ switch (e->Iex.Binop.op) {
+ case Iop_InterleaveEvenLanes8x16: is_lo = 0; size = 0; break;
+ case Iop_InterleaveOddLanes8x16: is_lo = 1; size = 0; break;
+ case Iop_InterleaveEvenLanes16x8: is_lo = 0; size = 1; break;
+ case Iop_InterleaveOddLanes16x8: is_lo = 1; size = 1; break;
+ case Iop_InterleaveEvenLanes32x4: is_lo = 0; size = 2; break;
+ case Iop_InterleaveOddLanes32x4: is_lo = 1; size = 2; break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vpanic("Illegal element size in VTRN");
+ }
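+         /* VTRN transposes 2x2 element matrices across its operands:
+            one register ends up with the even lanes of both inputs and
+            the other with the odd lanes.  It overwrites both operands,
+            so the inputs are copied into scratch registers first. */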
+ if (is_lo) {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argL, 4, True));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argR, 4, True));
+ addInstr(env, ARMInstr_NDual(ARMneon_TRN,
+ res, tmp, size, True));
+ } else {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argR, 4, True));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argL, 4, True));
+ addInstr(env, ARMInstr_NDual(ARMneon_TRN,
+ tmp, res, size, True));
+ }
+ return res;
+ }
+ case Iop_InterleaveHI8x16:
+ case Iop_InterleaveHI16x8:
+ case Iop_InterleaveHI32x4:
+ case Iop_InterleaveLO8x16:
+ case Iop_InterleaveLO16x8:
+ case Iop_InterleaveLO32x4: {
+ HReg tmp = newVRegV(env);
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ UInt is_lo;
+ switch (e->Iex.Binop.op) {
+ case Iop_InterleaveHI8x16: is_lo = 1; size = 0; break;
+ case Iop_InterleaveLO8x16: is_lo = 0; size = 0; break;
+ case Iop_InterleaveHI16x8: is_lo = 1; size = 1; break;
+ case Iop_InterleaveLO16x8: is_lo = 0; size = 1; break;
+ case Iop_InterleaveHI32x4: is_lo = 1; size = 2; break;
+ case Iop_InterleaveLO32x4: is_lo = 0; size = 2; break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vpanic("Illegal element size in VZIP");
+ }
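+         /* VZIP interleaves its two operands in place: one register
+            receives the low half of the zipped pair and the other the
+            high half.  As with VTRN, the inputs are copied first. */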
+ if (is_lo) {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argL, 4, True));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argR, 4, True));
+ addInstr(env, ARMInstr_NDual(ARMneon_ZIP,
+ res, tmp, size, True));
+ } else {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argR, 4, True));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argL, 4, True));
+ addInstr(env, ARMInstr_NDual(ARMneon_ZIP,
+ tmp, res, size, True));
+ }
+ return res;
+ }
+ case Iop_CatOddLanes8x16:
+ case Iop_CatOddLanes16x8:
+ case Iop_CatOddLanes32x4:
+ case Iop_CatEvenLanes8x16:
+ case Iop_CatEvenLanes16x8:
+ case Iop_CatEvenLanes32x4: {
+ HReg tmp = newVRegV(env);
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ UInt is_lo;
+ switch (e->Iex.Binop.op) {
+ case Iop_CatOddLanes8x16: is_lo = 1; size = 0; break;
+ case Iop_CatEvenLanes8x16: is_lo = 0; size = 0; break;
+ case Iop_CatOddLanes16x8: is_lo = 1; size = 1; break;
+ case Iop_CatEvenLanes16x8: is_lo = 0; size = 1; break;
+ case Iop_CatOddLanes32x4: is_lo = 1; size = 2; break;
+ case Iop_CatEvenLanes32x4: is_lo = 0; size = 2; break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vpanic("Illegal element size in VUZP");
+ }
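+         /* VUZP de-interleaves in place: one register collects the
+            even-numbered lanes of the concatenation, the other the
+            odd-numbered ones.  Again the inputs are copied first. */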
+ if (is_lo) {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argL, 4, True));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argR, 4, True));
+ addInstr(env, ARMInstr_NDual(ARMneon_UZP,
+ res, tmp, size, True));
+ } else {
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ tmp, argR, 4, True));
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY,
+ res, argL, 4, True));
+ addInstr(env, ARMInstr_NDual(ARMneon_UZP,
+ tmp, res, size, True));
+ }
+ return res;
+ }
+ case Iop_QAdd8Ux16:
+ case Iop_QAdd16Ux8:
+ case Iop_QAdd32Ux4:
+ case Iop_QAdd64Ux2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QAdd8Ux16: size = 0; break;
+ case Iop_QAdd16Ux8: size = 1; break;
+ case Iop_QAdd32Ux4: size = 2; break;
+ case Iop_QAdd64Ux2: size = 3; break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vpanic("Illegal element size in VQADDU");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQADDU,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_QAdd8Sx16:
+ case Iop_QAdd16Sx8:
+ case Iop_QAdd32Sx4:
+ case Iop_QAdd64Sx2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QAdd8Sx16: size = 0; break;
+ case Iop_QAdd16Sx8: size = 1; break;
+ case Iop_QAdd32Sx4: size = 2; break;
+ case Iop_QAdd64Sx2: size = 3; break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vpanic("Illegal element size in VQADDS");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQADDS,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Sub8x16:
+ case Iop_Sub16x8:
+ case Iop_Sub32x4:
+ case Iop_Sub64x2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Sub8x16: size = 0; break;
+ case Iop_Sub16x8: size = 1; break;
+ case Iop_Sub32x4: size = 2; break;
+ case Iop_Sub64x2: size = 3; break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vpanic("Illegal element size in VSUB");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Sub32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUBFP,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_QSub8Ux16:
+ case Iop_QSub16Ux8:
+ case Iop_QSub32Ux4:
+ case Iop_QSub64Ux2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QSub8Ux16: size = 0; break;
+ case Iop_QSub16Ux8: size = 1; break;
+ case Iop_QSub32Ux4: size = 2; break;
+ case Iop_QSub64Ux2: size = 3; break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vpanic("Illegal element size in VQSUBU");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQSUBU,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_QSub8Sx16:
+ case Iop_QSub16Sx8:
+ case Iop_QSub32Sx4:
+ case Iop_QSub64Sx2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QSub8Sx16: size = 0; break;
+ case Iop_QSub16Sx8: size = 1; break;
+ case Iop_QSub32Sx4: size = 2; break;
+ case Iop_QSub64Sx2: size = 3; break;
+ default:
+ ppIROp(e->Iex.Binop.op);
+ vpanic("Illegal element size in VQSUBS");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQSUBS,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Max8Ux16:
+ case Iop_Max16Ux8:
+ case Iop_Max32Ux4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Max8Ux16: size = 0; break;
+ case Iop_Max16Ux8: size = 1; break;
+ case Iop_Max32Ux4: size = 2; break;
+ default: vpanic("Illegal element size in VMAXU");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXU,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Max8Sx16:
+ case Iop_Max16Sx8:
+ case Iop_Max32Sx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Max8Sx16: size = 0; break;
+ case Iop_Max16Sx8: size = 1; break;
+ case Iop_Max32Sx4: size = 2; break;
+ default: vpanic("Illegal element size in VMAXU");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXS,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Min8Ux16:
+ case Iop_Min16Ux8:
+ case Iop_Min32Ux4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Min8Ux16: size = 0; break;
+ case Iop_Min16Ux8: size = 1; break;
+ case Iop_Min32Ux4: size = 2; break;
+ default: vpanic("Illegal element size in VMAXU");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMINU,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Min8Sx16:
+ case Iop_Min16Sx8:
+ case Iop_Min32Sx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Min8Sx16: size = 0; break;
+ case Iop_Min16Sx8: size = 1; break;
+ case Iop_Min32Sx4: size = 2; break;
+ default: vpanic("Illegal element size in VMAXU");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMINS,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Sar8x16:
+ case Iop_Sar16x8:
+ case Iop_Sar32x4:
+ case Iop_Sar64x2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ HReg argR2 = newVRegV(env);
+ HReg zero = newVRegV(env);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Sar8x16: size = 0; break;
+ case Iop_Sar16x8: size = 1; break;
+ case Iop_Sar32x4: size = 2; break;
+ case Iop_Sar64x2: size = 3; break;
+ default: vassert(0);
+ }
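+         /* NEON shifts by a register only shift left; shifting right
+            is done by shifting left by a negated amount.  So negate
+            each lane of the shift vector (0 - argR) and use the signed
+            variant VSAL, which shifts right arithmetically for
+            negative counts. */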
+ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0,0)));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB,
+ argR2, zero, argR, size, True));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSAL,
+ res, argL, argR2, size, True));
+ return res;
+ }
+ case Iop_Sal8x16:
+ case Iop_Sal16x8:
+ case Iop_Sal32x4:
+ case Iop_Sal64x2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Sal8x16: size = 0; break;
+ case Iop_Sal16x8: size = 1; break;
+ case Iop_Sal32x4: size = 2; break;
+ case Iop_Sal64x2: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NShift(ARMneon_VSAL,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Shr8x16:
+ case Iop_Shr16x8:
+ case Iop_Shr32x4:
+ case Iop_Shr64x2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ HReg argR2 = newVRegV(env);
+ HReg zero = newVRegV(env);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Shr8x16: size = 0; break;
+ case Iop_Shr16x8: size = 1; break;
+ case Iop_Shr32x4: size = 2; break;
+ case Iop_Shr64x2: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NeonImm(zero, ARMNImm_TI(0,0)));
+ addInstr(env, ARMInstr_NBinary(ARMneon_VSUB,
+ argR2, zero, argR, size, True));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ res, argL, argR2, size, True));
+ return res;
+ }
+ case Iop_Shl8x16:
+ case Iop_Shl16x8:
+ case Iop_Shl32x4:
+ case Iop_Shl64x2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_Shl8x16: size = 0; break;
+ case Iop_Shl16x8: size = 1; break;
+ case Iop_Shl32x4: size = 2; break;
+ case Iop_Shl64x2: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_QShl8x16:
+ case Iop_QShl16x8:
+ case Iop_QShl32x4:
+ case Iop_QShl64x2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QShl8x16: size = 0; break;
+ case Iop_QShl16x8: size = 1; break;
+ case Iop_QShl32x4: size = 2; break;
+ case Iop_QShl64x2: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NShift(ARMneon_VQSHL,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_QSal8x16:
+ case Iop_QSal16x8:
+ case Iop_QSal32x4:
+ case Iop_QSal64x2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_QSal8x16: size = 0; break;
+ case Iop_QSal16x8: size = 1; break;
+ case Iop_QSal32x4: size = 2; break;
+ case Iop_QSal64x2: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NShift(ARMneon_VQSAL,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_QShlN8x16:
+ case Iop_QShlN16x8:
+ case Iop_QShlN32x4:
+ case Iop_QShlN64x2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ UInt size, imm;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM taget supports Iop_QShlNAxB with constant "
+ "second argument only\n");
+ }
+ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
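+         /* The element width tag and the shift amount are packed into
+            a single operand: the highest set bit (8/16/32/64) selects
+            the element size and the low bits hold the immediate, which
+            the instruction emitter unpacks later. */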
+ switch (e->Iex.Binop.op) {
+ case Iop_QShlN8x16: size = 8 | imm; break;
+ case Iop_QShlN16x8: size = 16 | imm; break;
+ case Iop_QShlN32x4: size = 32 | imm; break;
+ case Iop_QShlN64x2: size = 64 | imm; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNUU,
+ res, argL, size, True));
+ return res;
+ }
+ case Iop_QShlN8Sx16:
+ case Iop_QShlN16Sx8:
+ case Iop_QShlN32Sx4:
+ case Iop_QShlN64Sx2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ UInt size, imm;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM taget supports Iop_QShlNASxB with constant "
+ "second argument only\n");
+ }
+ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ switch (e->Iex.Binop.op) {
+ case Iop_QShlN8Sx16: size = 8 | imm; break;
+ case Iop_QShlN16Sx8: size = 16 | imm; break;
+ case Iop_QShlN32Sx4: size = 32 | imm; break;
+ case Iop_QShlN64Sx2: size = 64 | imm; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNUS,
+ res, argL, size, True));
+ return res;
+ }
+ case Iop_QSalN8x16:
+ case Iop_QSalN16x8:
+ case Iop_QSalN32x4:
+ case Iop_QSalN64x2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ UInt size, imm;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM taget supports Iop_QShlNAxB with constant "
+ "second argument only\n");
+ }
+ imm = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ switch (e->Iex.Binop.op) {
+ case Iop_QSalN8x16: size = 8 | imm; break;
+ case Iop_QSalN16x8: size = 16 | imm; break;
+ case Iop_QSalN32x4: size = 32 | imm; break;
+ case Iop_QSalN64x2: size = 64 | imm; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_VQSHLNSS,
+ res, argL, size, True));
+ return res;
+ }
+ case Iop_ShrN8x16:
+ case Iop_ShrN16x8:
+ case Iop_ShrN32x4:
+ case Iop_ShrN64x2: {
+ HReg res = newVRegV(env);
+ HReg tmp = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg argR2 = newVRegI(env);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_ShrN8x16: size = 0; break;
+ case Iop_ShrN16x8: size = 1; break;
+ case Iop_ShrN32x4: size = 2; break;
+ case Iop_ShrN64x2: size = 3; break;
+ default: vassert(0);
+ }
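+         /* Same left-shift-only trick as above, but for a scalar
+            shift count: negate it, broadcast it to every lane with
+            VDUP, and shift left by the (negative) result. */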
+ addInstr(env, ARMInstr_Unary(ARMun_NEG, argR2, argR));
+ addInstr(env, ARMInstr_NUnary(ARMneon_DUP,
+ tmp, argR2, 0, True));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ res, argL, tmp, size, True));
+ return res;
+ }
+ case Iop_ShlN8x16:
+ case Iop_ShlN16x8:
+ case Iop_ShlN32x4:
+ case Iop_ShlN64x2: {
+ HReg res = newVRegV(env);
+ HReg tmp = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_ShlN8x16: size = 0; break;
+ case Iop_ShlN16x8: size = 1; break;
+ case Iop_ShlN32x4: size = 2; break;
+ case Iop_ShlN64x2: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, tmp, argR, 0, True));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSHL,
+ res, argL, tmp, size, True));
+ return res;
+ }
+ case Iop_SarN8x16:
+ case Iop_SarN16x8:
+ case Iop_SarN32x4:
+ case Iop_SarN64x2: {
+ HReg res = newVRegV(env);
+ HReg tmp = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ HReg argR2 = newVRegI(env);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_SarN8x16: size = 0; break;
+ case Iop_SarN16x8: size = 1; break;
+ case Iop_SarN32x4: size = 2; break;
+ case Iop_SarN64x2: size = 3; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_Unary(ARMun_NEG, argR2, argR));
+ addInstr(env, ARMInstr_NUnary(ARMneon_DUP, tmp, argR2, 0, True));
+ addInstr(env, ARMInstr_NShift(ARMneon_VSAL,
+ res, argL, tmp, size, True));
+ return res;
+ }
+ case Iop_CmpGT8Ux16:
+ case Iop_CmpGT16Ux8:
+ case Iop_CmpGT32Ux4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpGT8Ux16: size = 0; break;
+ case Iop_CmpGT16Ux8: size = 1; break;
+ case Iop_CmpGT32Ux4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTU,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_CmpGT8Sx16:
+ case Iop_CmpGT16Sx8:
+ case Iop_CmpGT32Sx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpGT8Sx16: size = 0; break;
+ case Iop_CmpGT16Sx8: size = 1; break;
+ case Iop_CmpGT32Sx4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTS,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_CmpEQ8x16:
+ case Iop_CmpEQ16x8:
+ case Iop_CmpEQ32x4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size;
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ8x16: size = 0; break;
+ case Iop_CmpEQ16x8: size = 1; break;
+ case Iop_CmpEQ32x4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCEQ,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Mul8x16:
+ case Iop_Mul16x8:
+ case Iop_Mul32x4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_Mul8x16: size = 0; break;
+ case Iop_Mul16x8: size = 1; break;
+ case Iop_Mul32x4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMUL,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Mul32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMULFP,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Mull8Ux8:
+ case Iop_Mull16Ux4:
+ case Iop_Mull32Ux2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_Mull8Ux8: size = 0; break;
+ case Iop_Mull16Ux4: size = 1; break;
+ case Iop_Mull32Ux2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMULLU,
+ res, argL, argR, size, True));
+ return res;
+ }
+
+ case Iop_Mull8Sx8:
+ case Iop_Mull16Sx4:
+ case Iop_Mull32Sx2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_Mull8Sx8: size = 0; break;
+ case Iop_Mull16Sx4: size = 1; break;
+ case Iop_Mull32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMULLS,
+ res, argL, argR, size, True));
+ return res;
+ }
+
+ case Iop_QDMulHi16Sx8:
+ case Iop_QDMulHi32Sx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_QDMulHi16Sx8: size = 1; break;
+ case Iop_QDMulHi32Sx4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQDMULH,
+ res, argL, argR, size, True));
+ return res;
+ }
+
+ case Iop_QRDMulHi16Sx8:
+ case Iop_QRDMulHi32Sx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_QRDMulHi16Sx8: size = 1; break;
+ case Iop_QRDMulHi32Sx4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQRDMULH,
+ res, argL, argR, size, True));
+ return res;
+ }
+
+ case Iop_QDMulLong16Sx4:
+ case Iop_QDMulLong32Sx2: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_QDMulLong16Sx4: size = 1; break;
+ case Iop_QDMulLong32Sx2: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VQDMULL,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_PolynomialMul8x16: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMULP,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_Max32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMAXF,
+ res, argL, argR, 2, True));
+ return res;
+ }
+ case Iop_Min32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMINF,
+ res, argL, argR, 2, True));
+ return res;
+ }
+ case Iop_PwMax32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPMAXF,
+ res, argL, argR, 2, True));
+ return res;
+ }
+ case Iop_PwMin32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPMINF,
+ res, argL, argR, 2, True));
+ return res;
+ }
+ case Iop_CmpGT32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGTF,
+ res, argL, argR, 2, True));
+ return res;
+ }
+ case Iop_CmpGE32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCGEF,
+ res, argL, argR, 2, True));
+ return res;
+ }
+ case Iop_CmpEQ32Fx4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ addInstr(env, ARMInstr_NBinary(ARMneon_VCEQF,
+ res, argL, argR, 2, True));
+ return res;
+ }
+
+ case Iop_PolynomialMull8x8: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeon64Expr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ addInstr(env, ARMInstr_NBinary(ARMneon_VMULLP,
+ res, argL, argR, size, True));
+ return res;
+ }
+ case Iop_F32ToFixed32Ux4_RZ:
+ case Iop_F32ToFixed32Sx4_RZ:
+ case Iop_Fixed32UToF32x4_RN:
+ case Iop_Fixed32SToF32x4_RN: {
+ HReg res = newVRegV(env);
+ HReg arg = iselNeonExpr(env, e->Iex.Binop.arg1);
+ ARMNeonUnOp op;
+ UInt imm6;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM supports FP <-> Fixed conversion with constant "
+ "second argument less than 33 only\n");
+ }
+ imm6 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ vassert(imm6 <= 32 && imm6 > 0);
+ imm6 = 64 - imm6;
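+         /* The NEON encoding holds 64 minus the number of fraction
+            bits in its imm6 field, hence the adjustment above. */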
+ switch(e->Iex.Binop.op) {
+ case Iop_F32ToFixed32Ux4_RZ: op = ARMneon_VCVTFtoFixedU; break;
+ case Iop_F32ToFixed32Sx4_RZ: op = ARMneon_VCVTFtoFixedS; break;
+ case Iop_Fixed32UToF32x4_RN: op = ARMneon_VCVTFixedUtoF; break;
+ case Iop_Fixed32SToF32x4_RN: op = ARMneon_VCVTFixedStoF; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NUnary(op, res, arg, imm6, True));
+ return res;
+ }
+ /*
+ FIXME remove if not used
+ case Iop_VDup8x16:
+ case Iop_VDup16x8:
+ case Iop_VDup32x4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeon64Expr(env, e->Iex.Binop.arg1);
+ UInt imm4;
+ UInt index;
+ if (e->Iex.Binop.arg2->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Binop.arg2) != Ity_I8) {
+ vpanic("ARM supports Iop_VDup with constant "
+ "second argument less than 16 only\n");
+ }
+ index = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ switch(e->Iex.Binop.op) {
+ case Iop_VDup8x16: imm4 = (index << 1) + 1; break;
+ case Iop_VDup16x8: imm4 = (index << 2) + 2; break;
+ case Iop_VDup32x4: imm4 = (index << 3) + 4; break;
+ default: vassert(0);
+ }
+ if (imm4 >= 16) {
+ vpanic("ARM supports Iop_VDup with constant "
+ "second argument less than 16 only\n");
+ }
+ addInstr(env, ARMInstr_NUnary(ARMneon_VDUP,
+ res, argL, imm4, True));
+ return res;
+ }
+ */
+ case Iop_PwAdd8x16:
+ case Iop_PwAdd16x8:
+ case Iop_PwAdd32x4: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Binop.arg2);
+ UInt size = 0;
+ switch(e->Iex.Binop.op) {
+ case Iop_PwAdd8x16: size = 0; break;
+ case Iop_PwAdd16x8: size = 1; break;
+ case Iop_PwAdd32x4: size = 2; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VPADD,
+ res, argL, argR, size, True));
+ return res;
+ }
+ /* ... */
+ default:
+ break;
+ }
+ }
+
+ if (e->tag == Iex_Triop) {
+ switch (e->Iex.Triop.op) {
+ case Iop_ExtractV128: {
+ HReg res = newVRegV(env);
+ HReg argL = iselNeonExpr(env, e->Iex.Triop.arg1);
+ HReg argR = iselNeonExpr(env, e->Iex.Triop.arg2);
+ UInt imm4;
+ if (e->Iex.Triop.arg3->tag != Iex_Const ||
+ typeOfIRExpr(env->type_env, e->Iex.Triop.arg3) != Ity_I8) {
+ vpanic("ARM target supports Iop_ExtractV128 with constant "
+ "third argument less than 16 only\n");
+ }
+ imm4 = e->Iex.Triop.arg3->Iex.Const.con->Ico.U8;
+ if (imm4 >= 16) {
+ vpanic("ARM target supports Iop_ExtractV128 with constant "
+ "third argument less than 16 only\n");
+ }
+ addInstr(env, ARMInstr_NBinary(ARMneon_VEXT,
+ res, argL, argR, imm4, True));
+ return res;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (e->tag == Iex_Mux0X) {
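+      /* Compute exprX into the destination, then conditionally
+         overwrite it with expr0: TST cond, #0xFF sets Z exactly when
+         the condition's low byte is zero, and the NCMovQ fires on
+         EQ. */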
+ HReg r8;
+ HReg rX = iselNeonExpr(env, e->Iex.Mux0X.exprX);
+ HReg r0 = iselNeonExpr(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegV(env);
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, dst, rX, 4, True));
+ r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
+ addInstr(env, ARMInstr_CmpOrTst(False/*!isCmp*/, r8,
+ ARMRI84_I84(0xFF,0)));
+ addInstr(env, ARMInstr_NCMovQ(ARMcc_EQ, dst, r0));
+ return dst;
+ }
+
+ neon_expr_bad:
+ ppIRExpr(e);
+ vpanic("iselNeonExpr_wrk");
+}
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Floating point expressions (64 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Compute a 64-bit floating point value into a register, the identity
+   of which is returned.  As with iselIntExpr_R, the returned reg is
+   always virtual, and it must not be changed by subsequent code
+   emitted by the caller. */
+
+static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselDblExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcFlt64);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(e);
+ vassert(ty == Ity_F64);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Const) {
+ /* Just handle the zero case. */
+ IRConst* con = e->Iex.Const.con;
+ if (con->tag == Ico_F64i && con->Ico.F64i == 0ULL) {
+ HReg z32 = newVRegI(env);
+ HReg dst = newVRegD(env);
+ addInstr(env, ARMInstr_Imm32(z32, 0));
+ addInstr(env, ARMInstr_VXferD(True/*toD*/, dst, z32, z32));
+ return dst;
+ }
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ ARMAModeV* am;
+ HReg res = newVRegD(env);
+ vassert(e->Iex.Load.ty == Ity_F64);
+ am = iselIntExpr_AModeV(env, e->Iex.Load.addr);
+ addInstr(env, ARMInstr_VLdStD(True/*isLoad*/, res, am));
+ return res;
+ }
+
+ if (e->tag == Iex_Get) {
+      // XXX This won't work if offset > 1020 or is not a multiple of 4.
+      // In those cases we'll have to generate more longwinded code.
+ ARMAModeV* am = mkARMAModeV(hregARM_R8(), e->Iex.Get.offset);
+ HReg res = newVRegD(env);
+ addInstr(env, ARMInstr_VLdStD(True/*isLoad*/, res, am));
+ return res;
+ }
+
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+ case Iop_ReinterpI64asF64: {
+ if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+ return iselNeon64Expr(env, e->Iex.Unop.arg);
+ } else {
+ HReg srcHi, srcLo;
+ HReg dst = newVRegD(env);
+ iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_VXferD(True/*toD*/, dst, srcHi, srcLo));
+ return dst;
+ }
+ }
+ case Iop_NegF64: {
+ HReg src = iselDblExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegD(env);
+ addInstr(env, ARMInstr_VUnaryD(ARMvfpu_NEG, dst, src));
+ return dst;
+ }
+ case Iop_AbsF64: {
+ HReg src = iselDblExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegD(env);
+ addInstr(env, ARMInstr_VUnaryD(ARMvfpu_ABS, dst, src));
+ return dst;
+ }
+ case Iop_F32toF64: {
+ HReg src = iselFltExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegD(env);
+ addInstr(env, ARMInstr_VCvtSD(True/*sToD*/, dst, src));
+ return dst;
+ }
+ case Iop_I32UtoF64:
+ case Iop_I32StoF64: {
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ HReg f32 = newVRegF(env);
+ HReg dst = newVRegD(env);
+ Bool syned = e->Iex.Unop.op == Iop_I32StoF64;
+ /* VMOV f32, src */
+ addInstr(env, ARMInstr_VXferS(True/*toS*/, f32, src));
+ /* FSITOD dst, f32 */
+ addInstr(env, ARMInstr_VCvtID(True/*iToD*/, syned,
+ dst, f32));
+ return dst;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+ case Iop_SqrtF64: {
+ /* first arg is rounding mode; we ignore it. */
+ HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegD(env);
+ addInstr(env, ARMInstr_VUnaryD(ARMvfpu_SQRT, dst, src));
+ return dst;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (e->tag == Iex_Triop) {
+ switch (e->Iex.Triop.op) {
+ case Iop_DivF64:
+ case Iop_MulF64:
+ case Iop_AddF64:
+ case Iop_SubF64: {
+ ARMVfpOp op = 0; /*INVALID*/
+ HReg argL = iselDblExpr(env, e->Iex.Triop.arg2);
+ HReg argR = iselDblExpr(env, e->Iex.Triop.arg3);
+ HReg dst = newVRegD(env);
+ switch (e->Iex.Triop.op) {
+ case Iop_DivF64: op = ARMvfp_DIV; break;
+ case Iop_MulF64: op = ARMvfp_MUL; break;
+ case Iop_AddF64: op = ARMvfp_ADD; break;
+ case Iop_SubF64: op = ARMvfp_SUB; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_VAluD(op, dst, argL, argR));
+ return dst;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (e->tag == Iex_Mux0X) {
+ if (ty == Ity_F64
+ && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
+ HReg r8;
+ HReg rX = iselDblExpr(env, e->Iex.Mux0X.exprX);
+ HReg r0 = iselDblExpr(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegD(env);
+ addInstr(env, ARMInstr_VUnaryD(ARMvfpu_COPY, dst, rX));
+ r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
+ addInstr(env, ARMInstr_CmpOrTst(False/*!isCmp*/, r8,
+ ARMRI84_I84(0xFF,0)));
+ addInstr(env, ARMInstr_VCMovD(ARMcc_EQ, dst, r0));
+ return dst;
+ }
+ }
+
+ ppIRExpr(e);
+ vpanic("iselDblExpr_wrk");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Floating point expressions (32 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Compute a 32-bit floating point value into a register, the identity
+   of which is returned.  As with iselIntExpr_R, the returned reg is
+   always virtual, and it must not be changed by subsequent code
+   emitted by the caller. */
+
+static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselFltExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcFlt32);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(e);
+ vassert(ty == Ity_F32);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ ARMAModeV* am;
+ HReg res = newVRegF(env);
+ vassert(e->Iex.Load.ty == Ity_F32);
+ am = iselIntExpr_AModeV(env, e->Iex.Load.addr);
+ addInstr(env, ARMInstr_VLdStS(True/*isLoad*/, res, am));
+ return res;
+ }
+
+ if (e->tag == Iex_Get) {
+      // XXX This won't work if offset > 1020 or is not a multiple of 4.
+      // In those cases we'll have to generate more longwinded code.
+ ARMAModeV* am = mkARMAModeV(hregARM_R8(), e->Iex.Get.offset);
+ HReg res = newVRegF(env);
+ addInstr(env, ARMInstr_VLdStS(True/*isLoad*/, res, am));
+ return res;
+ }
+
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+ case Iop_ReinterpI32asF32: {
+ HReg dst = newVRegF(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, ARMInstr_VXferS(True/*toS*/, dst, src));
+ return dst;
+ }
+ case Iop_NegF32: {
+ HReg src = iselFltExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegF(env);
+ addInstr(env, ARMInstr_VUnaryS(ARMvfpu_NEG, dst, src));
+ return dst;
+ }
+ case Iop_AbsF32: {
+ HReg src = iselFltExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegF(env);
+ addInstr(env, ARMInstr_VUnaryS(ARMvfpu_ABS, dst, src));
+ return dst;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+ case Iop_SqrtF32: {
+ /* first arg is rounding mode; we ignore it. */
+ HReg src = iselFltExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegF(env);
+ addInstr(env, ARMInstr_VUnaryS(ARMvfpu_SQRT, dst, src));
+ return dst;
+ }
+ case Iop_F64toF32: {
+ HReg valD = iselDblExpr(env, e->Iex.Binop.arg2);
+ set_VFP_rounding_mode(env, e->Iex.Binop.arg1);
+ HReg valS = newVRegF(env);
+ /* FCVTSD valS, valD */
+ addInstr(env, ARMInstr_VCvtSD(False/*!sToD*/, valS, valD));
+ set_VFP_rounding_default(env);
+ return valS;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (e->tag == Iex_Triop) {
+ switch (e->Iex.Triop.op) {
+ case Iop_DivF32:
+ case Iop_MulF32:
+ case Iop_AddF32:
+ case Iop_SubF32: {
+ ARMVfpOp op = 0; /*INVALID*/
+ HReg argL = iselFltExpr(env, e->Iex.Triop.arg2);
+ HReg argR = iselFltExpr(env, e->Iex.Triop.arg3);
+ HReg dst = newVRegF(env);
+ switch (e->Iex.Triop.op) {
+ case Iop_DivF32: op = ARMvfp_DIV; break;
+ case Iop_MulF32: op = ARMvfp_MUL; break;
+ case Iop_AddF32: op = ARMvfp_ADD; break;
+ case Iop_SubF32: op = ARMvfp_SUB; break;
+ default: vassert(0);
+ }
+ addInstr(env, ARMInstr_VAluS(op, dst, argL, argR));
+ return dst;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (e->tag == Iex_Mux0X) {
+ if (ty == Ity_F32
+ && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
+ HReg r8;
+ HReg rX = iselFltExpr(env, e->Iex.Mux0X.exprX);
+ HReg r0 = iselFltExpr(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegF(env);
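+         /* Same scheme as the F64 Mux0X case above: dst = exprX,
+            conditionally overwritten with expr0 when the low 8 bits
+            of the condition are zero. */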
+ addInstr(env, ARMInstr_VUnaryS(ARMvfpu_COPY, dst, rX));
+ r8 = iselIntExpr_R(env, e->Iex.Mux0X.cond);
+ addInstr(env, ARMInstr_CmpOrTst(False/*!isCmp*/, r8,
+ ARMRI84_I84(0xFF,0)));
+ addInstr(env, ARMInstr_VCMovS(ARMcc_EQ, dst, r0));
+ return dst;
+ }
+ }
+
+ ppIRExpr(e);
+ vpanic("iselFltExpr_wrk");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Statements ---*/
+/*---------------------------------------------------------*/
+
+static void iselStmt ( ISelEnv* env, IRStmt* stmt )
+{
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ vex_printf("\n-- ");
+ ppIRStmt(stmt);
+ vex_printf("\n");
+ }
+ switch (stmt->tag) {
+
+ /* --------- STORE --------- */
+ /* little-endian write to memory */
+ case Ist_Store: {
+ IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
+ IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
+ IREndness end = stmt->Ist.Store.end;
+
+ if (tya != Ity_I32 || end != Iend_LE)
+ goto stmt_fail;
+
+ if (tyd == Ity_I32) {
+ HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data);
+ ARMAMode1* am = iselIntExpr_AMode1(env, stmt->Ist.Store.addr);
+ addInstr(env, ARMInstr_LdSt32(False/*!isLoad*/, rD, am));
+ return;
+ }
+ if (tyd == Ity_I16) {
+ HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data);
+ ARMAMode2* am = iselIntExpr_AMode2(env, stmt->Ist.Store.addr);
+ addInstr(env, ARMInstr_LdSt16(False/*!isLoad*/,
+ False/*!isSignedLoad*/, rD, am));
+ return;
+ }
+ if (tyd == Ity_I8) {
+ HReg rD = iselIntExpr_R(env, stmt->Ist.Store.data);
+ ARMAMode1* am = iselIntExpr_AMode1(env, stmt->Ist.Store.addr);
+ addInstr(env, ARMInstr_LdSt8U(False/*!isLoad*/, rD, am));
+ return;
+ }
+ if (tyd == Ity_I64) {
+ if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+ HReg dD = iselNeon64Expr(env, stmt->Ist.Store.data);
+ ARMAModeN* am = iselIntExpr_AModeN(env, stmt->Ist.Store.addr);
+ addInstr(env, ARMInstr_NLdStD(False, dD, am));
+ } else {
+ HReg rDhi, rDlo, rA;
+ iselInt64Expr(&rDhi, &rDlo, env, stmt->Ist.Store.data);
+ rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
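+            /* Little-endian: the low word goes at [rA+0], the high
+               word at [rA+4]. */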
+ addInstr(env, ARMInstr_LdSt32(False/*!load*/, rDhi,
+ ARMAMode1_RI(rA,4)));
+ addInstr(env, ARMInstr_LdSt32(False/*!load*/, rDlo,
+ ARMAMode1_RI(rA,0)));
+ }
+ return;
+ }
+ if (tyd == Ity_F64) {
+ HReg dD = iselDblExpr(env, stmt->Ist.Store.data);
+ ARMAModeV* am = iselIntExpr_AModeV(env, stmt->Ist.Store.addr);
+ addInstr(env, ARMInstr_VLdStD(False/*!isLoad*/, dD, am));
+ return;
+ }
+ if (tyd == Ity_F32) {
+ HReg fD = iselFltExpr(env, stmt->Ist.Store.data);
+ ARMAModeV* am = iselIntExpr_AModeV(env, stmt->Ist.Store.addr);
+ addInstr(env, ARMInstr_VLdStS(False/*!isLoad*/, fD, am));
+ return;
+ }
+ if (tyd == Ity_V128) {
+ HReg qD = iselNeonExpr(env, stmt->Ist.Store.data);
+ ARMAModeN* am = iselIntExpr_AModeN(env, stmt->Ist.Store.addr);
+ addInstr(env, ARMInstr_NLdStQ(False, qD, am));
+ return;
+ }
+
+ break;
+ }
+
+ /* --------- PUT --------- */
+ /* write guest state, fixed offset */
+ case Ist_Put: {
+ IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
+
+ if (tyd == Ity_I32) {
+ HReg rD = iselIntExpr_R(env, stmt->Ist.Put.data);
+ ARMAMode1* am = ARMAMode1_RI(hregARM_R8(), stmt->Ist.Put.offset);
+ addInstr(env, ARMInstr_LdSt32(False/*!isLoad*/, rD, am));
+ return;
+ }
+ if (tyd == Ity_I64) {
+ if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+ HReg addr = newVRegI(env);
+ HReg qD = iselNeon64Expr(env, stmt->Ist.Put.data);
+ addInstr(env, ARMInstr_Add32(addr, hregARM_R8(),
+ stmt->Ist.Put.offset));
+ addInstr(env, ARMInstr_NLdStD(False, qD, mkARMAModeN_R(addr)));
+ } else {
+ HReg rDhi, rDlo;
+ ARMAMode1* am0 = ARMAMode1_RI(hregARM_R8(),
+ stmt->Ist.Put.offset + 0);
+ ARMAMode1* am4 = ARMAMode1_RI(hregARM_R8(),
+ stmt->Ist.Put.offset + 4);
+ iselInt64Expr(&rDhi, &rDlo, env, stmt->Ist.Put.data);
+ addInstr(env, ARMInstr_LdSt32(False/*!isLoad*/, rDhi, am4));
+ addInstr(env, ARMInstr_LdSt32(False/*!isLoad*/, rDlo, am0));
+ }
+ return;
+ }
+ if (tyd == Ity_F64) {
+         // XXX This won't work if the offset is > 1020 or is not a
+         // multiple of 4.  In that case we'll have to generate more
+         // longwinded code.
+ ARMAModeV* am = mkARMAModeV(hregARM_R8(), stmt->Ist.Put.offset);
+ HReg rD = iselDblExpr(env, stmt->Ist.Put.data);
+ addInstr(env, ARMInstr_VLdStD(False/*!isLoad*/, rD, am));
+ return;
+ }
+ if (tyd == Ity_F32) {
+         // XXX This won't work if the offset is > 1020 or is not a
+         // multiple of 4.  In that case we'll have to generate more
+         // longwinded code.
+ ARMAModeV* am = mkARMAModeV(hregARM_R8(), stmt->Ist.Put.offset);
+ HReg rD = iselFltExpr(env, stmt->Ist.Put.data);
+ addInstr(env, ARMInstr_VLdStS(False/*!isLoad*/, rD, am));
+ return;
+ }
+ if (tyd == Ity_V128) {
+ HReg addr = newVRegI(env);
+ HReg qD = iselNeonExpr(env, stmt->Ist.Put.data);
+ addInstr(env, ARMInstr_Add32(addr, hregARM_R8(),
+ stmt->Ist.Put.offset));
+ addInstr(env, ARMInstr_NLdStQ(False, qD, mkARMAModeN_R(addr)));
+ return;
+ }
+ break;
+ }
+
+//zz /* --------- Indexed PUT --------- */
+//zz /* write guest state, run-time offset */
+//zz case Ist_PutI: {
+//zz ARMAMode2* am2
+//zz = genGuestArrayOffset(
+//zz env, stmt->Ist.PutI.descr,
+//zz stmt->Ist.PutI.ix, stmt->Ist.PutI.bias );
+//zz
+//zz IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.PutI.data);
+//zz
+//zz if (tyd == Ity_I8) {
+//zz HReg reg = iselIntExpr_R(env, stmt->Ist.PutI.data);
+//zz addInstr(env, ARMInstr_StoreB(reg, am2));
+//zz return;
+//zz }
+//zz// CAB: Ity_I32, Ity_I16 ?
+//zz break;
+//zz }
+
+ /* --------- TMP --------- */
+ /* assign value to temporary */
+ case Ist_WrTmp: {
+ IRTemp tmp = stmt->Ist.WrTmp.tmp;
+ IRType ty = typeOfIRTemp(env->type_env, tmp);
+
+ if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
+ ARMRI84* ri84 = iselIntExpr_RI84(NULL, False,
+ env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ addInstr(env, ARMInstr_Mov(dst,ri84));
+ return;
+ }
+ if (ty == Ity_I1) {
+ HReg dst = lookupIRTemp(env, tmp);
+ ARMCondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
+ addInstr(env, ARMInstr_Mov(dst, ARMRI84_I84(0,0)));
+ addInstr(env, ARMInstr_CMov(cond, dst, ARMRI84_I84(1,0)));
+ return;
+ }
+ if (ty == Ity_I64) {
+ if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+ HReg src = iselNeon64Expr(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, dst, src, 4, False));
+ } else {
+ HReg rHi, rLo, dstHi, dstLo;
+ iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
+ lookupIRTemp64( &dstHi, &dstLo, env, tmp);
+ addInstr(env, mk_iMOVds_RR(dstHi, rHi) );
+ addInstr(env, mk_iMOVds_RR(dstLo, rLo) );
+ }
+ return;
+ }
+ if (ty == Ity_F64) {
+ HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ addInstr(env, ARMInstr_VUnaryD(ARMvfpu_COPY, dst, src));
+ return;
+ }
+ if (ty == Ity_F32) {
+ HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ addInstr(env, ARMInstr_VUnaryS(ARMvfpu_COPY, dst, src));
+ return;
+ }
+ if (ty == Ity_V128) {
+ HReg src = iselNeonExpr(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ addInstr(env, ARMInstr_NUnary(ARMneon_COPY, dst, src, 4, True));
+ return;
+ }
+ break;
+ }
+
+ /* --------- Call to DIRTY helper --------- */
+ /* call complex ("dirty") helper function */
+ case Ist_Dirty: {
+ IRType retty;
+ IRDirty* d = stmt->Ist.Dirty.details;
+ Bool passBBP = False;
+
+ if (d->nFxState == 0)
+ vassert(!d->needsBBP);
+
+ passBBP = toBool(d->nFxState > 0 && d->needsBBP);
+
+ /* Marshal args, do the call, clear stack. */
+ Bool ok = doHelperCall( env, passBBP, d->guard, d->cee, d->args );
+ if (!ok)
+ break; /* will go to stmt_fail: */
+
+ /* Now figure out what to do with the returned value, if any. */
+ if (d->tmp == IRTemp_INVALID)
+ /* No return value. Nothing to do. */
+ return;
+
+ retty = typeOfIRTemp(env->type_env, d->tmp);
+
+ if (retty == Ity_I64) {
+ if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+ HReg tmp = lookupIRTemp(env, d->tmp);
+ addInstr(env, ARMInstr_VXferD(True, tmp, hregARM_R1(),
+ hregARM_R0()));
+ } else {
+ HReg dstHi, dstLo;
+ /* The returned value is in r1:r0. Park it in the
+ register-pair associated with tmp. */
+ lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
+ addInstr(env, mk_iMOVds_RR(dstHi, hregARM_R1()) );
+ addInstr(env, mk_iMOVds_RR(dstLo, hregARM_R0()) );
+ }
+ return;
+ }
+ if (retty == Ity_I32 || retty == Ity_I16 || retty == Ity_I8) {
+ /* The returned value is in r0. Park it in the register
+ associated with tmp. */
+ HReg dst = lookupIRTemp(env, d->tmp);
+ addInstr(env, mk_iMOVds_RR(dst, hregARM_R0()) );
+ return;
+ }
+
+ break;
+ }
+
+ /* --------- Load Linked and Store Conditional --------- */
+ case Ist_LLSC: {
+ if (stmt->Ist.LLSC.storedata == NULL) {
+ /* LL */
+ IRTemp res = stmt->Ist.LLSC.result;
+ IRType ty = typeOfIRTemp(env->type_env, res);
+ if (ty == Ity_I32 || ty == Ity_I8) {
+ Int szB = 0;
+ HReg r_dst = lookupIRTemp(env, res);
+ HReg raddr = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
+ switch (ty) {
+ case Ity_I8: szB = 1; break;
+ case Ity_I32: szB = 4; break;
+ default: vassert(0);
+ }
+ addInstr(env, mk_iMOVds_RR(hregARM_R1(), raddr));
+ addInstr(env, ARMInstr_LdrEX(szB));
+ addInstr(env, mk_iMOVds_RR(r_dst, hregARM_R0()));
+ return;
+ }
+ /* else fall thru; is unhandled */
+ } else {
+ /* SC */
+ IRTemp res = stmt->Ist.LLSC.result;
+ IRType ty = typeOfIRTemp(env->type_env, res);
+ IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.LLSC.storedata);
+ vassert(ty == Ity_I1);
+ if (tyd == Ity_I32 || tyd == Ity_I8) {
+ Int szB = 0;
+ HReg r_res = lookupIRTemp(env, res);
+ HReg rD = iselIntExpr_R(env, stmt->Ist.LLSC.storedata);
+ HReg rA = iselIntExpr_R(env, stmt->Ist.LLSC.addr);
+ ARMRI84* one = ARMRI84_I84(1,0);
+ switch (tyd) {
+ case Ity_I8: szB = 1; break;
+ case Ity_I32: szB = 4; break;
+ default: vassert(0);
+ }
+ addInstr(env, mk_iMOVds_RR(hregARM_R1(), rD));
+ addInstr(env, mk_iMOVds_RR(hregARM_R2(), rA));
+ addInstr(env, ARMInstr_StrEX(szB));
+ /* now r0 is 1 if failed, 0 if success. Change to IR
+ conventions (0 is fail, 1 is success). Also transfer
+ result to r_res. */
+ addInstr(env, ARMInstr_Alu(ARMalu_XOR, r_res, hregARM_R0(), one));
+ /* And be conservative -- mask off all but the lowest bit */
+ addInstr(env, ARMInstr_Alu(ARMalu_AND, r_res, r_res, one));
+ return;
+ }
+ /* else fall thru; is unhandled */
+ }
+ break;
+ }
+
+ /* --------- MEM FENCE --------- */
+ case Ist_MBE:
+ switch (stmt->Ist.MBE.event) {
+ case Imbe_Fence:
+ addInstr(env,ARMInstr_MFence());
+ return;
+ default:
+ break;
+ }
+ break;
+
+ /* --------- INSTR MARK --------- */
+ /* Doesn't generate any executable code ... */
+ case Ist_IMark:
+ return;
+
+ /* --------- NO-OP --------- */
+ case Ist_NoOp:
+ return;
+
+ /* --------- EXIT --------- */
+ case Ist_Exit: {
+ HReg gnext;
+ ARMCondCode cc;
+ if (stmt->Ist.Exit.dst->tag != Ico_U32)
+ vpanic("isel_arm: Ist_Exit: dst is not a 32-bit value");
+ gnext = iselIntExpr_R(env, IRExpr_Const(stmt->Ist.Exit.dst));
+ cc = iselCondCode(env, stmt->Ist.Exit.guard);
+ addInstr(env, mk_iMOVds_RR(hregARM_R14(), env->savedLR));
+ addInstr(env, ARMInstr_Goto(stmt->Ist.Exit.jk, cc, gnext));
+ return;
+ }
+
+ default: break;
+ }
+ stmt_fail:
+ ppIRStmt(stmt);
+ vpanic("iselStmt");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Basic block terminators (Nexts) ---*/
+/*---------------------------------------------------------*/
+
+static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
+{
+ HReg rDst;
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ vex_printf("\n-- goto {");
+ ppIRJumpKind(jk);
+ vex_printf("} ");
+ ppIRExpr(next);
+ vex_printf("\n");
+ }
+ rDst = iselIntExpr_R(env, next);
+ addInstr(env, mk_iMOVds_RR(hregARM_R14(), env->savedLR));
+ addInstr(env, ARMInstr_Goto(jk, ARMcc_AL, rDst));
+}
+
+
+/*---------------------------------------------------------*/
+/*--- Insn selector top-level ---*/
+/*---------------------------------------------------------*/
+
+/* Translate an entire SB to arm code. */
+
+HInstrArray* iselSB_ARM ( IRSB* bb, VexArch arch_host,
+ VexArchInfo* archinfo_host,
+ VexAbiInfo* vbi/*UNUSED*/ )
+{
+ Int i, j;
+ HReg hreg, hregHI;
+ ISelEnv* env;
+ UInt hwcaps_host = archinfo_host->hwcaps;
+ Bool neon = False;
+ static UInt counter = 0;
+
+ /* sanity ... */
+ vassert(arch_host == VexArchARM);
+
+ /* hwcaps should not change from one ISEL call to another. */
+ arm_hwcaps = hwcaps_host;
+
+ /* Make up an initial environment to use. */
+ env = LibVEX_Alloc(sizeof(ISelEnv));
+ env->vreg_ctr = 0;
+
+ /* Set up output code array. */
+ env->code = newHInstrArray();
+
+ /* Copy BB's type env. */
+ env->type_env = bb->tyenv;
+
+ /* Make up an IRTemp -> virtual HReg mapping. This doesn't
+ change as we go along. */
+ env->n_vregmap = bb->tyenv->types_used;
+ env->vregmap = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
+ env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
+
+ /* For each IR temporary, allocate a suitably-kinded virtual
+ register. */
+ j = 0;
+ for (i = 0; i < env->n_vregmap; i++) {
+ hregHI = hreg = INVALID_HREG;
+ switch (bb->tyenv->types[i]) {
+ case Ity_I1:
+ case Ity_I8:
+ case Ity_I16:
+ case Ity_I32: hreg = mkHReg(j++, HRcInt32, True); break;
+ case Ity_I64:
+ if (arm_hwcaps & VEX_HWCAPS_ARM_NEON) {
+ hreg = mkHReg(j++, HRcFlt64, True);
+ neon = True;
+ } else {
+ hregHI = mkHReg(j++, HRcInt32, True);
+ hreg = mkHReg(j++, HRcInt32, True);
+ }
+ break;
+ case Ity_F32: hreg = mkHReg(j++, HRcFlt32, True); break;
+ case Ity_F64: hreg = mkHReg(j++, HRcFlt64, True); break;
+ case Ity_V128: hreg = mkHReg(j++, HRcVec128, True);
+ neon = True; break;
+ default: ppIRType(bb->tyenv->types[i]);
+ vpanic("iselBB: IRTemp type");
+ }
+ env->vregmap[i] = hreg;
+ env->vregmapHI[i] = hregHI;
+ }
+ env->vreg_ctr = j;
+
+ /* Keep a copy of the link reg, since any call to a helper function
+ will trash it, and we can't get back to the dispatcher once that
+ happens. */
+ env->savedLR = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR(env->savedLR, hregARM_R14()));
+
+ /* Ok, finally we can iterate over the statements. */
+ for (i = 0; i < bb->stmts_used; i++)
+ iselStmt(env,bb->stmts[i]);
+
+ iselNext(env,bb->next,bb->jumpkind);
+
+ /* record the number of vregs we used. */
+ env->code->n_vregs = env->vreg_ctr;
+ counter++;
+ return env->code;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_arm_isel.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_reg_alloc2.c b/VEX/priv/host_generic_reg_alloc2.c
new file mode 100644
index 0000000..48303ff
--- /dev/null
+++ b/VEX/priv/host_generic_reg_alloc2.c
@@ -0,0 +1,1549 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_reg_alloc2.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex.h"
+
+#include "main_util.h"
+#include "host_generic_regs.h"
+
+/* Set to 1 for lots of debugging output. */
+#define DEBUG_REGALLOC 0
+
+
+/* TODO 27 Oct 04:
+
+ Better consistency checking from what isMove tells us.
+
+ We can possibly do V-V coalescing even when the src is spilled,
+ providing we can arrange for the dst to have the same spill slot.
+
+ Note that state[].hreg is the same as the available real regs.
+
+ Generally rationalise data structures. */
+
+
+/* Records information on virtual register live ranges. Computed once
+ and remains unchanged after that. */
+typedef
+ struct {
+ /* Becomes live for the first time after this insn ... */
+ Short live_after;
+ /* Becomes dead for the last time before this insn ... */
+ Short dead_before;
+ /* The "home" spill slot, if needed. Never changes. */
+ Short spill_offset;
+ Short spill_size;
+ /* What kind of register this is. */
+ HRegClass reg_class;
+ }
+ VRegLR;
+
+
+/* Records information on real-register live ranges. Computed once
+ and remains unchanged after that. */
+typedef
+ struct {
+ HReg rreg;
+ /* Becomes live after this insn ... */
+ Short live_after;
+ /* Becomes dead before this insn ... */
+ Short dead_before;
+ }
+ RRegLR;
+
+
+/* An array of the following structs (rreg_state) comprises the
+ running state of the allocator. It indicates what the current
+ disposition of each allocatable real register is. The array gets
+ updated as the allocator processes instructions. */
+typedef
+ struct {
+ /* ------ FIELDS WHICH DO NOT CHANGE ------ */
+ /* Which rreg is this for? */
+ HReg rreg;
+ /* Is this involved in any HLRs? (only an optimisation hint) */
+ Bool has_hlrs;
+ /* ------ FIELDS WHICH DO CHANGE ------ */
+ /* 6 May 07: rearranged fields below so the whole struct fits
+ into 16 bytes on both x86 and amd64. */
+ /* Used when .disp == Bound and we are looking for vregs to
+ spill. */
+ Bool is_spill_cand;
+ /* Optimisation: used when .disp == Bound. Indicates when the
+ rreg has the same value as the spill slot for the associated
+ vreg. Is safely left at False, and becomes True after a
+ spill store or reload for this rreg. */
+ Bool eq_spill_slot;
+      /* What's its current disposition? */
+ enum { Free, /* available for use */
+ Unavail, /* in a real-reg live range */
+ Bound /* in use (holding value of some vreg) */
+ }
+ disp;
+ /* If .disp == Bound, what vreg is it bound to? */
+ HReg vreg;
+ }
+ RRegState;
+
+
+/* The allocator also maintains a redundant array of indexes
+ (vreg_state) from vreg numbers back to entries in rreg_state. It
+ is redundant because iff vreg_state[i] == j then
+ hregNumber(rreg_state[j].vreg) == i -- that is, the two entries
+ point at each other. The purpose of this is to speed up activities
+ which involve looking for a particular vreg: there is no need to
+ scan the rreg_state looking for it, just index directly into
+ vreg_state. The FAQ "does this vreg already have an associated
+ rreg" is the main beneficiary.
+
+ To indicate, in vreg_state[i], that a given vreg is not currently
+ associated with any rreg, that entry can be set to INVALID_RREG_NO.
+
+ Because the vreg_state entries are signed Shorts, the max number
+   of vregs that can be handled by regalloc is 32767.
+*/
+
+#define INVALID_RREG_NO ((Short)(-1))
+
+#define IS_VALID_VREGNO(_zz) ((_zz) >= 0 && (_zz) < n_vregs)
+#define IS_VALID_RREGNO(_zz) ((_zz) >= 0 && (_zz) < n_rregs)
+
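+/* Illustrative sketch only (not used by the allocator): the two-way
+   mapping invariant described above, written out as a checker.  The
+   arrays and their sizes are locals of doRegisterAllocation below, so
+   they are passed in here. */
+#if 0
+static void check_mutual_maps ( RRegState* rreg_state, Int n_rregs,
+                                Short* vreg_state, Int n_vregs )
+{
+   Int j, k;
+   for (j = 0; j < n_rregs; j++) {
+      if (rreg_state[j].disp != Bound)
+         continue;
+      k = hregNumber(rreg_state[j].vreg);
+      vassert(IS_VALID_VREGNO(k));
+      vassert(vreg_state[k] == j);
+   }
+   for (j = 0; j < n_vregs; j++) {
+      k = vreg_state[j];
+      if (k == INVALID_RREG_NO)
+         continue;
+      vassert(IS_VALID_RREGNO(k));
+      vassert(rreg_state[k].disp == Bound);
+      vassert(hregNumber(rreg_state[k].vreg) == j);
+   }
+}
+#endif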
+
+/* Does this instruction mention a particular reg? */
+static Bool instrMentionsReg (
+ void (*getRegUsage) (HRegUsage*, HInstr*, Bool),
+ HInstr* instr,
+ HReg r,
+ Bool mode64
+)
+{
+ Int i;
+ HRegUsage reg_usage;
+   (*getRegUsage)(&reg_usage, instr, mode64);
+ for (i = 0; i < reg_usage.n_used; i++)
+ if (reg_usage.hreg[i] == r)
+ return True;
+ return False;
+}
+
+
+/* Search forward from some given point in the incoming instruction
+ sequence. Point is to select a virtual register to spill, by
+ finding the vreg which is mentioned as far ahead as possible, in
+ the hope that this will minimise the number of consequent reloads.
+
+ Only do the search for vregs which are Bound in the running state,
+ and for which the .is_spill_cand field is set. This allows the
+ caller to arbitrarily restrict the set of spill candidates to be
+ considered.
+
+ Returns an index into the state array indicating the (v,r) pair to
+ spill, or -1 if none was found. */
+static
+Int findMostDistantlyMentionedVReg (
+ void (*getRegUsage) (HRegUsage*, HInstr*, Bool),
+ HInstrArray* instrs_in,
+ Int search_from_instr,
+ RRegState* state,
+ Int n_state,
+ Bool mode64
+)
+{
+ Int k, m;
+ Int furthest_k = -1;
+ Int furthest = -1;
+ vassert(search_from_instr >= 0);
+ for (k = 0; k < n_state; k++) {
+ if (!state[k].is_spill_cand)
+ continue;
+ vassert(state[k].disp == Bound);
+ for (m = search_from_instr; m < instrs_in->arr_used; m++) {
+ if (instrMentionsReg(getRegUsage,
+ instrs_in->arr[m], state[k].vreg, mode64))
+ break;
+ }
+ if (m > furthest) {
+ furthest = m;
+ furthest_k = k;
+ }
+ }
+ return furthest_k;
+}
+
+
+/* Check that this vreg has been assigned a sane spill offset. */
+static inline void sanity_check_spill_offset ( VRegLR* vreg )
+{
+ if (vreg->reg_class == HRcVec128 || vreg->reg_class == HRcFlt64) {
+ vassert(0 == ((UShort)vreg->spill_offset % 16));
+ } else {
+ vassert(0 == ((UShort)vreg->spill_offset % 8));
+ }
+}
+
+
+/* Double the size of the real-reg live-range array, if needed. */
+static void ensureRRLRspace ( RRegLR** info, Int* size, Int used )
+{
+ Int k;
+ RRegLR* arr2;
+ if (used < *size) return;
+ if (0)
+ vex_printf("ensureRRISpace: %d -> %d\n", *size, 2 * *size);
+ vassert(used == *size);
+ arr2 = LibVEX_Alloc(2 * *size * sizeof(RRegLR));
+ for (k = 0; k < *size; k++)
+ arr2[k] = (*info)[k];
+ *size *= 2;
+ *info = arr2;
+}
+
+
+/* Sort an array of RRegLR entries by either the .live_after or
+ .dead_before fields. This is performance-critical. */
+static void sortRRLRarray ( RRegLR* arr,
+ Int size, Bool by_live_after )
+{
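+   /* A shell sort, using Knuth's gap sequence h(i+1) = 3*h(i) + 1
+      (1, 4, 13, 40, ...). */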
+ Int incs[14] = { 1, 4, 13, 40, 121, 364, 1093, 3280,
+ 9841, 29524, 88573, 265720,
+ 797161, 2391484 };
+ Int lo = 0;
+ Int hi = size-1;
+ Int i, j, h, bigN, hp;
+ RRegLR v;
+
+ vassert(size >= 0);
+ if (size == 0)
+ return;
+
+ bigN = hi - lo + 1; if (bigN < 2) return;
+ hp = 0; while (hp < 14 && incs[hp] < bigN) hp++; hp--;
+
+ if (by_live_after) {
+
+ for ( ; hp >= 0; hp--) {
+ h = incs[hp];
+ for (i = lo + h; i <= hi; i++) {
+ v = arr[i];
+ j = i;
+ while (arr[j-h].live_after > v.live_after) {
+ arr[j] = arr[j-h];
+ j = j - h;
+ if (j <= (lo + h - 1)) break;
+ }
+ arr[j] = v;
+ }
+ }
+
+ } else {
+
+ for ( ; hp >= 0; hp--) {
+ h = incs[hp];
+ for (i = lo + h; i <= hi; i++) {
+ v = arr[i];
+ j = i;
+ while (arr[j-h].dead_before > v.dead_before) {
+ arr[j] = arr[j-h];
+ j = j - h;
+ if (j <= (lo + h - 1)) break;
+ }
+ arr[j] = v;
+ }
+ }
+
+ }
+}
+
+
+/* A target-independent register allocator. Requires various
+ functions which it uses to deal abstractly with instructions and
+ registers, since it cannot have any target-specific knowledge.
+
+ Returns a new list of instructions, which, as a result of the
+ behaviour of mapRegs, will be in-place modifications of the
+ original instructions.
+
+ Requires that the incoming code has been generated using
+ vreg numbers 0, 1 .. n_vregs-1. Appearance of a vreg outside
+ that range is a checked run-time error.
+
+ Takes an expandable array of pointers to unallocated insns.
+ Returns an expandable array of pointers to allocated insns.
+*/
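+
+/* Illustrative only: roughly how the translation pipeline invokes
+   this allocator, with the callbacks supplied by the selected host
+   backend.  Parameter order follows the declaration below; vcode and
+   rcode are placeholder names, and the actual call site may differ
+   in detail.
+
+      HInstrArray* rcode
+         = doRegisterAllocation ( vcode, available_real_regs,
+                                  n_available_real_regs,
+                                  isMove, getRegUsage, mapRegs,
+                                  genSpill, genReload, directReload,
+                                  guest_sizeB, ppInstr, ppReg, mode64 );
+*/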
+HInstrArray* doRegisterAllocation (
+
+ /* Incoming virtual-registerised code. */
+ HInstrArray* instrs_in,
+
+ /* An array listing all the real registers the allocator may use,
+ in no particular order. */
+ HReg* available_real_regs,
+ Int n_available_real_regs,
+
+ /* Return True iff the given insn is a reg-reg move, in which
+ case also return the src and dst regs. */
+ Bool (*isMove) ( HInstr*, HReg*, HReg* ),
+
+ /* Get info about register usage in this insn. */
+ void (*getRegUsage) ( HRegUsage*, HInstr*, Bool ),
+
+ /* Apply a reg-reg mapping to an insn. */
+ void (*mapRegs) ( HRegRemap*, HInstr*, Bool ),
+
+ /* Return one, or, if we're unlucky, two insn(s) to spill/restore a
+ real reg to a spill slot byte offset. The two leading HInstr**
+ args are out parameters, through which the generated insns are
+ returned. Also (optionally) a 'directReload' function, which
+ attempts to replace a given instruction by one which reads
+ directly from a specified spill slot. May be NULL, in which
+ case the optimisation is not attempted. */
+ void (*genSpill) ( HInstr**, HInstr**, HReg, Int, Bool ),
+ void (*genReload) ( HInstr**, HInstr**, HReg, Int, Bool ),
+ HInstr* (*directReload) ( HInstr*, HReg, Short ),
+ Int guest_sizeB,
+
+ /* For debug printing only. */
+ void (*ppInstr) ( HInstr*, Bool ),
+ void (*ppReg) ( HReg ),
+
+ /* 32/64bit mode */
+ Bool mode64
+)
+{
+# define N_SPILL64S (LibVEX_N_SPILL_BYTES / 8)
+
+ const Bool eq_spill_opt = True;
+
+ /* Iterators and temporaries. */
+ Int ii, j, k, m, spillee, k_suboptimal;
+ HReg rreg, vreg, vregS, vregD;
+ HRegUsage reg_usage;
+
+ /* Info on vregs and rregs. Computed once and remains
+ unchanged. */
+ Int n_vregs;
+ VRegLR* vreg_lrs; /* [0 .. n_vregs-1] */
+
+ /* We keep two copies of the real-reg live range info, one sorted
+ by .live_after and the other by .dead_before. First the
+      unsorted info is created in the _la variant and then copied into
+      the _db variant.  Once that's done, both of them are sorted.
+ We also need two integer cursors which record the next
+ location in the two arrays to consider. */
+ RRegLR* rreg_lrs_la;
+ RRegLR* rreg_lrs_db;
+ Int rreg_lrs_size;
+ Int rreg_lrs_used;
+ Int rreg_lrs_la_next;
+ Int rreg_lrs_db_next;
+
+ /* Used when constructing vreg_lrs (for allocating stack
+ slots). */
+ Int ss_busy_until_before[N_SPILL64S];
+
+ /* Used when constructing rreg_lrs. */
+ Int* rreg_live_after;
+ Int* rreg_dead_before;
+
+ /* Running state of the core allocation algorithm. */
+ RRegState* rreg_state; /* [0 .. n_rregs-1] */
+ Int n_rregs;
+
+ /* .. and the redundant backward map */
+ /* Each value is 0 .. n_rregs-1 or is INVALID_RREG_NO.
+      This implies n_rregs must be <= 32768. */
+ Short* vreg_state; /* [0 .. n_vregs-1] */
+
+ /* The vreg -> rreg map constructed and then applied to each
+ instr. */
+ HRegRemap remap;
+
+ /* The output array of instructions. */
+ HInstrArray* instrs_out;
+
+ /* Sanity checks are expensive. They are only done periodically,
+ not at each insn processed. */
+ Bool do_sanity_check;
+
+ vassert(0 == (guest_sizeB % 16));
+ vassert(0 == (LibVEX_N_SPILL_BYTES % 16));
+ vassert(0 == (N_SPILL64S % 2));
+
+ /* The live range numbers are signed shorts, and so limiting the
+ number of insns to 10000 comfortably guards against them
+ overflowing 32k. */
+ vassert(instrs_in->arr_used <= 10000);
+
+# define INVALID_INSTRNO (-2)
+
+# define EMIT_INSTR(_instr) \
+ do { \
+ HInstr* _tmp = (_instr); \
+ if (DEBUG_REGALLOC) { \
+ vex_printf("** "); \
+ (*ppInstr)(_tmp, mode64); \
+ vex_printf("\n\n"); \
+ } \
+ addHInstr ( instrs_out, _tmp ); \
+ } while (0)
+
+# define PRINT_STATE \
+ do { \
+ Int z, q; \
+ for (z = 0; z < n_rregs; z++) { \
+ vex_printf(" rreg_state[%2d] = ", z); \
+ (*ppReg)(rreg_state[z].rreg); \
+ vex_printf(" \t"); \
+ switch (rreg_state[z].disp) { \
+ case Free: vex_printf("Free\n"); break; \
+ case Unavail: vex_printf("Unavail\n"); break; \
+ case Bound: vex_printf("BoundTo "); \
+ (*ppReg)(rreg_state[z].vreg); \
+ vex_printf("\n"); break; \
+ } \
+ } \
+ vex_printf("\n vreg_state[0 .. %d]:\n ", n_vregs-1); \
+ q = 0; \
+ for (z = 0; z < n_vregs; z++) { \
+ if (vreg_state[z] == INVALID_RREG_NO) \
+ continue; \
+ vex_printf("[%d] -> %d ", z, vreg_state[z]); \
+ q++; \
+ if (q > 0 && (q % 6) == 0) \
+ vex_printf("\n "); \
+ } \
+ vex_printf("\n"); \
+ } while (0)
+
+
+ /* --------- Stage 0: set up output array --------- */
+ /* --------- and allocate/initialise running state. --------- */
+
+ instrs_out = newHInstrArray();
+
+ /* ... and initialise running state. */
+ /* n_rregs is no more than a short name for n_available_real_regs. */
+ n_rregs = n_available_real_regs;
+ n_vregs = instrs_in->n_vregs;
+
+ /* If this is not so, vreg_state entries will overflow. */
+ vassert(n_vregs < 32767);
+
+ rreg_state = LibVEX_Alloc(n_rregs * sizeof(RRegState));
+ vreg_state = LibVEX_Alloc(n_vregs * sizeof(Short));
+
+ for (j = 0; j < n_rregs; j++) {
+ rreg_state[j].rreg = available_real_regs[j];
+ rreg_state[j].has_hlrs = False;
+ rreg_state[j].disp = Free;
+ rreg_state[j].vreg = INVALID_HREG;
+ rreg_state[j].is_spill_cand = False;
+ rreg_state[j].eq_spill_slot = False;
+ }
+
+ for (j = 0; j < n_vregs; j++)
+ vreg_state[j] = INVALID_RREG_NO;
+
+
+ /* --------- Stage 1: compute vreg live ranges. --------- */
+ /* --------- Stage 2: compute rreg live ranges. --------- */
+
+ /* ------ start of SET UP TO COMPUTE VREG LIVE RANGES ------ */
+
+ /* This is relatively simple, because (1) we only seek the complete
+ end-to-end live range of each vreg, and are not interested in
+ any holes in it, and (2) the vregs are conveniently numbered 0
+ .. n_vregs-1, so we can just dump the results in a
+ pre-allocated array. */
+
+ vreg_lrs = NULL;
+ if (n_vregs > 0)
+ vreg_lrs = LibVEX_Alloc(sizeof(VRegLR) * n_vregs);
+
+ for (j = 0; j < n_vregs; j++) {
+ vreg_lrs[j].live_after = INVALID_INSTRNO;
+ vreg_lrs[j].dead_before = INVALID_INSTRNO;
+ vreg_lrs[j].spill_offset = 0;
+ vreg_lrs[j].spill_size = 0;
+ vreg_lrs[j].reg_class = HRcINVALID;
+ }
+
+ /* ------ end of SET UP TO COMPUTE VREG LIVE RANGES ------ */
+
+ /* ------ start of SET UP TO COMPUTE RREG LIVE RANGES ------ */
+
+ /* This is more complex than Stage 1, because we need to compute
+ exactly all the live ranges of all the allocatable real regs,
+ and we don't know in advance how many there will be. */
+
+ rreg_lrs_used = 0;
+ rreg_lrs_size = 4;
+ rreg_lrs_la = LibVEX_Alloc(rreg_lrs_size * sizeof(RRegLR));
+ rreg_lrs_db = NULL; /* we'll create this later */
+
+   /* We'll need to track live range start/end points separately for
+ each rreg. Sigh. */
+ vassert(n_available_real_regs > 0);
+ rreg_live_after = LibVEX_Alloc(n_available_real_regs * sizeof(Int));
+ rreg_dead_before = LibVEX_Alloc(n_available_real_regs * sizeof(Int));
+
+ for (j = 0; j < n_available_real_regs; j++) {
+ rreg_live_after[j] =
+ rreg_dead_before[j] = INVALID_INSTRNO;
+ }
+
+ /* ------ end of SET UP TO COMPUTE RREG LIVE RANGES ------ */
+
+ /* ------ start of ITERATE OVER INSNS ------ */
+
+ for (ii = 0; ii < instrs_in->arr_used; ii++) {
+
+      (*getRegUsage)( &reg_usage, instrs_in->arr[ii], mode64 );
+
+# if 0
+ vex_printf("\n%d stage1: ", ii);
+ (*ppInstr)(instrs_in->arr[ii], mode64);
+ vex_printf("\n");
+      ppHRegUsage(&reg_usage);
+# endif
+
+ /* ------ start of DEAL WITH VREG LIVE RANGES ------ */
+
+ /* for each reg mentioned in the insn ... */
+ for (j = 0; j < reg_usage.n_used; j++) {
+
+ vreg = reg_usage.hreg[j];
+ /* only interested in virtual registers right now. */
+ if (!hregIsVirtual(vreg))
+ continue;
+ k = hregNumber(vreg);
+ if (k < 0 || k >= n_vregs) {
+ vex_printf("\n");
+ (*ppInstr)(instrs_in->arr[ii], mode64);
+ vex_printf("\n");
+ vex_printf("vreg %d, n_vregs %d\n", k, n_vregs);
+ vpanic("doRegisterAllocation: out-of-range vreg");
+ }
+
+ /* Take the opportunity to note its regclass. We'll need
+ that when allocating spill slots. */
+ if (vreg_lrs[k].reg_class == HRcINVALID) {
+ /* First mention of this vreg. */
+ vreg_lrs[k].reg_class = hregClass(vreg);
+ } else {
+ /* Seen it before, so check for consistency. */
+ vassert(vreg_lrs[k].reg_class == hregClass(vreg));
+ }
+
+ /* Now consider live ranges. */
+ switch (reg_usage.mode[j]) {
+ case HRmRead:
+ if (vreg_lrs[k].live_after == INVALID_INSTRNO) {
+ vex_printf("\n\nOFFENDING VREG = %d\n", k);
+ vpanic("doRegisterAllocation: "
+ "first event for vreg is Read");
+ }
+ vreg_lrs[k].dead_before = toShort(ii + 1);
+ break;
+ case HRmWrite:
+ if (vreg_lrs[k].live_after == INVALID_INSTRNO)
+ vreg_lrs[k].live_after = toShort(ii);
+ vreg_lrs[k].dead_before = toShort(ii + 1);
+ break;
+ case HRmModify:
+ if (vreg_lrs[k].live_after == INVALID_INSTRNO) {
+ vex_printf("\n\nOFFENDING VREG = %d\n", k);
+ vpanic("doRegisterAllocation: "
+ "first event for vreg is Modify");
+ }
+ vreg_lrs[k].dead_before = toShort(ii + 1);
+ break;
+ default:
+ vpanic("doRegisterAllocation(1)");
+ } /* switch */
+
+ } /* iterate over registers */
+
+ /* ------ end of DEAL WITH VREG LIVE RANGES ------ */
+
+ /* ------ start of DEAL WITH RREG LIVE RANGES ------ */
+
+ /* for each reg mentioned in the insn ... */
+ for (j = 0; j < reg_usage.n_used; j++) {
+
+ /* Dummy initialisations of flush_la and flush_db to avoid
+ possible bogus uninit-var warnings from gcc. */
+ Int flush_la = INVALID_INSTRNO, flush_db = INVALID_INSTRNO;
+ Bool flush;
+
+ rreg = reg_usage.hreg[j];
+
+ /* only interested in real registers right now. */
+ if (hregIsVirtual(rreg))
+ continue;
+
+ /* Furthermore, we're not interested in this rreg unless it's
+ one of the allocatable ones. For example, it could be a
+ stack pointer register, or some other register beyond our
+ control, in which case we should just ignore it. */
+ for (k = 0; k < n_available_real_regs; k++)
+ if (available_real_regs[k] == rreg)
+ break;
+ if (k == n_available_real_regs)
+ continue; /* not found -- ignore. */
+ flush = False;
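+         /* A write to an rreg which is already inside a live range
+            ends that range: it is flushed into the rreg_lrs_la array
+            below, and a new range starts at this insn. */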
+ switch (reg_usage.mode[j]) {
+ case HRmWrite:
+ flush_la = rreg_live_after[k];
+ flush_db = rreg_dead_before[k];
+ if (flush_la != INVALID_INSTRNO
+ && flush_db != INVALID_INSTRNO)
+ flush = True;
+ rreg_live_after[k] = ii;
+ rreg_dead_before[k] = ii+1;
+ break;
+ case HRmRead:
+ if (rreg_live_after[k] == INVALID_INSTRNO) {
+ vex_printf("\nOFFENDING RREG = ");
+ (*ppReg)(available_real_regs[k]);
+ vex_printf("\n");
+ vex_printf("\nOFFENDING instr = ");
+ (*ppInstr)(instrs_in->arr[ii], mode64);
+ vex_printf("\n");
+ vpanic("doRegisterAllocation: "
+ "first event for rreg is Read");
+ }
+ rreg_dead_before[k] = ii+1;
+ break;
+ case HRmModify:
+ if (rreg_live_after[k] == INVALID_INSTRNO) {
+ vex_printf("\nOFFENDING RREG = ");
+ (*ppReg)(available_real_regs[k]);
+ vex_printf("\n");
+ vex_printf("\nOFFENDING instr = ");
+ (*ppInstr)(instrs_in->arr[ii], mode64);
+ vex_printf("\n");
+ vpanic("doRegisterAllocation: "
+ "first event for rreg is Modify");
+ }
+ rreg_dead_before[k] = ii+1;
+ break;
+ default:
+ vpanic("doRegisterAllocation(2)");
+ }
+
+ if (flush) {
+ vassert(flush_la != INVALID_INSTRNO);
+ vassert(flush_db != INVALID_INSTRNO);
+ ensureRRLRspace(&rreg_lrs_la, &rreg_lrs_size, rreg_lrs_used);
+ if (0)
+ vex_printf("FLUSH 1 (%d,%d)\n", flush_la, flush_db);
+ rreg_lrs_la[rreg_lrs_used].rreg = rreg;
+ rreg_lrs_la[rreg_lrs_used].live_after = toShort(flush_la);
+ rreg_lrs_la[rreg_lrs_used].dead_before = toShort(flush_db);
+ rreg_lrs_used++;
+ }
+
+ } /* iterate over regs in the instr */
+
+ /* ------ end of DEAL WITH RREG LIVE RANGES ------ */
+
+ } /* iterate over insns */
+
+ /* ------ end of ITERATE OVER INSNS ------ */
+
+ /* ------ start of FINALISE RREG LIVE RANGES ------ */
+
+ /* Now finish up any live ranges left over. */
+ for (j = 0; j < n_available_real_regs; j++) {
+
+# if 0
+ vex_printf("residual %d: %d %d\n", j, rreg_live_after[j],
+ rreg_dead_before[j]);
+# endif
+ vassert( (rreg_live_after[j] == INVALID_INSTRNO
+ && rreg_dead_before[j] == INVALID_INSTRNO)
+ ||
+ (rreg_live_after[j] != INVALID_INSTRNO
+ && rreg_dead_before[j] != INVALID_INSTRNO)
+ );
+
+ if (rreg_live_after[j] == INVALID_INSTRNO)
+ continue;
+
+ ensureRRLRspace(&rreg_lrs_la, &rreg_lrs_size, rreg_lrs_used);
+ if (0)
+ vex_printf("FLUSH 2 (%d,%d)\n",
+ rreg_live_after[j], rreg_dead_before[j]);
+ rreg_lrs_la[rreg_lrs_used].rreg = available_real_regs[j];
+ rreg_lrs_la[rreg_lrs_used].live_after = toShort(rreg_live_after[j]);
+ rreg_lrs_la[rreg_lrs_used].dead_before = toShort(rreg_dead_before[j]);
+ rreg_lrs_used++;
+ }
+
+ /* Compute summary hints for choosing real regs. If a real reg is
+ involved in a hard live range, record that fact in the fixed
+ part of the running rreg_state. Later, when offered a choice between
+ rregs, it's better to choose one which is not marked as having
+ any HLRs, since ones with HLRs may need to be spilled around
+ their HLRs. Correctness of final assignment is unaffected by
+ this mechanism -- it is only an optimisation. */
+
+ for (j = 0; j < rreg_lrs_used; j++) {
+ rreg = rreg_lrs_la[j].rreg;
+ vassert(!hregIsVirtual(rreg));
+ /* rreg is involved in a HLR. Record this info in the array, if
+ there is space. */
+ for (k = 0; k < n_rregs; k++)
+ if (rreg_state[k].rreg == rreg)
+ break;
+ vassert(k < n_rregs); /* else rreg was not found in rreg_state?! */
+ rreg_state[k].has_hlrs = True;
+ }
+ if (0) {
+ for (j = 0; j < n_rregs; j++) {
+ if (!rreg_state[j].has_hlrs)
+ continue;
+ ppReg(rreg_state[j].rreg);
+ vex_printf(" hinted\n");
+ }
+ }
+
+ /* Finally, copy the _la variant into the _db variant and
+ sort both by their respective fields. */
+ rreg_lrs_db = LibVEX_Alloc(rreg_lrs_used * sizeof(RRegLR));
+ for (j = 0; j < rreg_lrs_used; j++)
+ rreg_lrs_db[j] = rreg_lrs_la[j];
+
+ sortRRLRarray( rreg_lrs_la, rreg_lrs_used, True /* by .live_after*/ );
+ sortRRLRarray( rreg_lrs_db, rreg_lrs_used, False/* by .dead_before*/ );
+
+ /* And set up the cursors. */
+ rreg_lrs_la_next = 0;
+ rreg_lrs_db_next = 0;
+
+ for (j = 1; j < rreg_lrs_used; j++) {
+ vassert(rreg_lrs_la[j-1].live_after <= rreg_lrs_la[j].live_after);
+ vassert(rreg_lrs_db[j-1].dead_before <= rreg_lrs_db[j].dead_before);
+ }
+
+ /* ------ end of FINALISE RREG LIVE RANGES ------ */
+
+# if DEBUG_REGALLOC
+ for (j = 0; j < n_vregs; j++) {
+ vex_printf("vreg %d: la = %d, db = %d\n",
+ j, vreg_lrs[j].live_after, vreg_lrs[j].dead_before );
+ }
+# endif
+
+# if DEBUG_REGALLOC
+ vex_printf("RRegLRs by LA:\n");
+ for (j = 0; j < rreg_lrs_used; j++) {
+ vex_printf(" ");
+ (*ppReg)(rreg_lrs_la[j].rreg);
+ vex_printf(" la = %d, db = %d\n",
+ rreg_lrs_la[j].live_after, rreg_lrs_la[j].dead_before );
+ }
+ vex_printf("RRegLRs by DB:\n");
+ for (j = 0; j < rreg_lrs_used; j++) {
+ vex_printf(" ");
+ (*ppReg)(rreg_lrs_db[j].rreg);
+ vex_printf(" la = %d, db = %d\n",
+ rreg_lrs_db[j].live_after, rreg_lrs_db[j].dead_before );
+ }
+# endif
+
+ /* --------- Stage 3: allocate spill slots. --------- */
+
+ /* Each spill slot is 8 bytes long. For vregs which take more than
+ 64 bits to spill (classes Flt64 and Vec128), we have to allocate
+ two spill slots.
+
+ For Vec128-class on PowerPC, the spill slot's actual address
+ must be 16-byte aligned. Since the spill slot's address is
+ computed as an offset from the guest state pointer, and since
+ the user of the generated code must set that pointer to a
+ 16-aligned value, we have the residual obligation here of
+ choosing a 16-aligned spill slot offset for Vec128-class values.
+ Since each spill slot is 8 bytes long, that means for
+      Vec128-class values we must allocate a spill slot number which
+ is zero mod 2.
+
+ Do a rank-based allocation of vregs to spill slot numbers. We
+ put as few values as possible in spill slots, but nevertheless
+ need to have a spill slot available for all vregs, just in case.
+ */
+ /* max_ss_no = -1; */
+
+ for (j = 0; j < N_SPILL64S; j++)
+ ss_busy_until_before[j] = 0;
+
+ for (j = 0; j < n_vregs; j++) {
+
+ /* True iff this vreg is unused. In which case we also expect
+ that the reg_class field for it has not been set. */
+ if (vreg_lrs[j].live_after == INVALID_INSTRNO) {
+ vassert(vreg_lrs[j].reg_class == HRcINVALID);
+ continue;
+ }
+
+ /* The spill slots are 64 bits in size. As per the comment on
+ definition of HRegClass in host_generic_regs.h, that means, to
+ spill a vreg of class Flt64 or Vec128, we'll need to find two
+         adjacent spill slots to use.  Note, this logic needs to be
+         kept in sync with the size info on the definition of
+         HRegClass. */
+
+ if (vreg_lrs[j].reg_class == HRcVec128
+ || vreg_lrs[j].reg_class == HRcFlt64) {
+
+         /* Find two adjacent free slots which between them provide
+ up to 128 bits in which to spill the vreg. Since we are
+ trying to find an even:odd pair, move along in steps of 2
+ (slots). */
+
+ for (k = 0; k < N_SPILL64S-1; k += 2)
+ if (ss_busy_until_before[k] <= vreg_lrs[j].live_after
+ && ss_busy_until_before[k+1] <= vreg_lrs[j].live_after)
+ break;
+ if (k >= N_SPILL64S-1) {
+ vpanic("LibVEX_N_SPILL_BYTES is too low. "
+ "Increase and recompile.");
+ }
+ if (0) vex_printf("16-byte spill offset in spill slot %d\n", (Int)k);
+ ss_busy_until_before[k+0] = vreg_lrs[j].dead_before;
+ ss_busy_until_before[k+1] = vreg_lrs[j].dead_before;
+
+ } else {
+
+ /* The ordinary case -- just find a single spill slot. */
+
+ /* Find the lowest-numbered spill slot which is available at
+ the start point of this interval, and assign the interval
+ to it. */
+ for (k = 0; k < N_SPILL64S; k++)
+ if (ss_busy_until_before[k] <= vreg_lrs[j].live_after)
+ break;
+ if (k == N_SPILL64S) {
+ vpanic("LibVEX_N_SPILL_BYTES is too low. "
+ "Increase and recompile.");
+ }
+ ss_busy_until_before[k] = vreg_lrs[j].dead_before;
+
+ }
+
+ /* This reflects LibVEX's hard-wired knowledge of the baseBlock
+ layout: the guest state, then two equal sized areas following
+ it for two sets of shadow state, and then the spill area. */
+ vreg_lrs[j].spill_offset = toShort(guest_sizeB * 3 + k * 8);
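+      /* Worked example (illustrative numbers): with guest_sizeB =
+         1728, a Vec128-class vreg given slots k = 4 and 5 gets
+         spill_offset = 1728*3 + 4*8 = 5216, which is 0 mod 16 as
+         required, since guest_sizeB itself is 0 mod 16 (asserted at
+         the top of this function). */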
+
+ /* Independent check that we've made a sane choice of slot */
+ sanity_check_spill_offset( &vreg_lrs[j] );
+ /* if (j > max_ss_no) */
+ /* max_ss_no = j; */
+ }
+
+# if 0
+ vex_printf("\n\n");
+ for (j = 0; j < n_vregs; j++)
+ vex_printf("vreg %d --> spill offset %d\n",
+ j, vreg_lrs[j].spill_offset);
+# endif
+
+ /* --------- Stage 4: establish rreg preferences --------- */
+
+   /* It may be advantageous to allocate certain vregs to specific
+ rregs, as a way of avoiding reg-reg moves later. Here we
+ establish which, if any, rreg each vreg would prefer to be in.
+ Note that this constrains the allocator -- ideally we end up
+ with as few as possible vregs expressing a preference.
+
+ This is an optimisation: if the .preferred_rreg field is never
+ set to anything different from INVALID_HREG, the allocator still
+ works. */
+
+ /* 30 Dec 04: removed this mechanism as it does not seem to
+ help. */
+
+ /* --------- Stage 5: process instructions --------- */
+
+ /* This is the main loop of the allocator. First, we need to
+ correctly set up our running state, which tracks the status of
+ each real register. */
+
+ /* ------ BEGIN: Process each insn in turn. ------ */
+
+ for (ii = 0; ii < instrs_in->arr_used; ii++) {
+
+# if DEBUG_REGALLOC
+ vex_printf("\n====----====---- Insn %d ----====----====\n", ii);
+ vex_printf("---- ");
+ (*ppInstr)(instrs_in->arr[ii], mode64);
+ vex_printf("\n\nInitial state:\n");
+ PRINT_STATE;
+ vex_printf("\n");
+# endif
+
+ /* ------------ Sanity checks ------------ */
+
+ /* Sanity checks are expensive. So they are done only once
+ every 7 instructions, and just before the last
+ instruction. */
+ do_sanity_check
+ = toBool(
+ False /* Set to True for sanity checking of all insns. */
+ || ii == instrs_in->arr_used-1
+ || (ii > 0 && (ii % 7) == 0)
+ );
+
+ if (do_sanity_check) {
+
+ /* Sanity check 1: all rregs with a hard live range crossing
+ this insn must be marked as unavailable in the running
+ state. */
+ for (j = 0; j < rreg_lrs_used; j++) {
+ if (rreg_lrs_la[j].live_after < ii
+ && ii < rreg_lrs_la[j].dead_before) {
+ /* ii is the middle of a hard live range for some real
+ reg. Check it's marked as such in the running
+ state. */
+
+# if 0
+ vex_printf("considering la %d .. db %d reg = ",
+ rreg_lrs[j].live_after,
+ rreg_lrs[j].dead_before);
+ (*ppReg)(rreg_lrs[j].rreg);
+ vex_printf("\n");
+# endif
+
+ /* find the state entry for this rreg */
+ for (k = 0; k < n_rregs; k++)
+ if (rreg_state[k].rreg == rreg_lrs_la[j].rreg)
+ break;
+
+ /* and assert that this rreg is marked as unavailable */
+ vassert(rreg_state[k].disp == Unavail);
+ }
+ }
+
+ /* Sanity check 2: conversely, all rregs marked as
+ unavailable in the running rreg_state must have a
+ corresponding hard live range entry in the rreg_lrs
+ array. */
+ for (j = 0; j < n_available_real_regs; j++) {
+ vassert(rreg_state[j].disp == Bound
+ || rreg_state[j].disp == Free
+ || rreg_state[j].disp == Unavail);
+ if (rreg_state[j].disp != Unavail)
+ continue;
+ for (k = 0; k < rreg_lrs_used; k++)
+ if (rreg_lrs_la[k].rreg == rreg_state[j].rreg
+ && rreg_lrs_la[k].live_after < ii
+ && ii < rreg_lrs_la[k].dead_before)
+ break;
+ /* If this vassertion fails, we couldn't find a
+ corresponding HLR. */
+ vassert(k < rreg_lrs_used);
+ }
+
+ /* Sanity check 3: all vreg-rreg bindings must bind registers
+ of the same class. */
+ for (j = 0; j < n_rregs; j++) {
+ if (rreg_state[j].disp != Bound) {
+ vassert(rreg_state[j].eq_spill_slot == False);
+ continue;
+ }
+ vassert(hregClass(rreg_state[j].rreg)
+ == hregClass(rreg_state[j].vreg));
+ vassert( hregIsVirtual(rreg_state[j].vreg));
+ vassert(!hregIsVirtual(rreg_state[j].rreg));
+ }
+
+ /* Sanity check 4: the vreg_state and rreg_state
+ mutually-redundant mappings are consistent. If
+ rreg_state[j].vreg points at some vreg_state entry then
+ that vreg_state entry should point back at
+ rreg_state[j]. */
+ for (j = 0; j < n_rregs; j++) {
+ if (rreg_state[j].disp != Bound)
+ continue;
+ k = hregNumber(rreg_state[j].vreg);
+ vassert(IS_VALID_VREGNO(k));
+ vassert(vreg_state[k] == j);
+ }
+ for (j = 0; j < n_vregs; j++) {
+ k = vreg_state[j];
+ if (k == INVALID_RREG_NO)
+ continue;
+ vassert(IS_VALID_RREGNO(k));
+ vassert(rreg_state[k].disp == Bound);
+ vassert(hregNumber(rreg_state[k].vreg) == j);
+ }
+
+ } /* if (do_sanity_check) */
+
+ /* ------------ end of Sanity checks ------------ */
+
+ /* Do various optimisations pertaining to register coalescing
+ and preferencing:
+ MOV v <-> v coalescing (done here).
+ MOV v <-> r coalescing (not yet, if ever)
+ */
+ /* If doing a reg-reg move between two vregs, and the src's live
+ range ends here and the dst's live range starts here, bind
+ the dst to the src's rreg, and that's all. */
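+      /* Example (illustrative): for a move v9 -> v5 at insn 17, this
+         fires when v9 has dead_before == 18 (this is its last use)
+         and v5 has live_after == 17 (it is born here).  v5 then
+         simply inherits v9's rreg, and the move insn is never
+         emitted. */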
+ if ( (*isMove)( instrs_in->arr[ii], &vregS, &vregD ) ) {
+ if (!hregIsVirtual(vregS)) goto cannot_coalesce;
+ if (!hregIsVirtual(vregD)) goto cannot_coalesce;
+ /* Check that *isMove is not telling us a bunch of lies ... */
+ vassert(hregClass(vregS) == hregClass(vregD));
+ k = hregNumber(vregS);
+ m = hregNumber(vregD);
+ vassert(IS_VALID_VREGNO(k));
+ vassert(IS_VALID_VREGNO(m));
+ if (vreg_lrs[k].dead_before != ii + 1) goto cannot_coalesce;
+ if (vreg_lrs[m].live_after != ii) goto cannot_coalesce;
+# if DEBUG_REGALLOC
+ vex_printf("COALESCE ");
+ (*ppReg)(vregS);
+ vex_printf(" -> ");
+ (*ppReg)(vregD);
+ vex_printf("\n\n");
+# endif
+ /* Find the state entry for vregS. */
+ for (m = 0; m < n_rregs; m++)
+ if (rreg_state[m].disp == Bound && rreg_state[m].vreg == vregS)
+ break;
+ if (m == n_rregs)
+ /* We failed to find a binding for vregS, which means it's
+ currently not in a register. So we can't do the
+ coalescing. Give up. */
+ goto cannot_coalesce;
+
+ /* Finally, we can do the coalescing. It's trivial -- merely
+ claim vregS's register for vregD. */
+ rreg_state[m].vreg = vregD;
+ vassert(IS_VALID_VREGNO(hregNumber(vregD)));
+ vassert(IS_VALID_VREGNO(hregNumber(vregS)));
+ vreg_state[hregNumber(vregD)] = toShort(m);
+ vreg_state[hregNumber(vregS)] = INVALID_RREG_NO;
+
+ /* This rreg has become associated with a different vreg and
+ hence with a different spill slot. Play safe. */
+ rreg_state[m].eq_spill_slot = False;
+
+ /* Move on to the next insn. We skip the post-insn stuff for
+ fixed registers, since this move should not interact with
+ them in any way. */
+ continue;
+ }
+ cannot_coalesce:
+
+ /* ------ Free up rregs bound to dead vregs ------ */
+
+ /* Look for vregs whose live range has just ended, and
+ mark the associated rreg as free. */
+
+ for (j = 0; j < n_rregs; j++) {
+ if (rreg_state[j].disp != Bound)
+ continue;
+ vreg = hregNumber(rreg_state[j].vreg);
+ vassert(IS_VALID_VREGNO(vreg));
+ if (vreg_lrs[vreg].dead_before <= ii) {
+ rreg_state[j].disp = Free;
+ rreg_state[j].eq_spill_slot = False;
+ m = hregNumber(rreg_state[j].vreg);
+ vassert(IS_VALID_VREGNO(m));
+ vreg_state[m] = INVALID_RREG_NO;
+ if (DEBUG_REGALLOC) {
+ vex_printf("free up ");
+ (*ppReg)(rreg_state[j].rreg);
+ vex_printf("\n");
+ }
+ }
+ }
+
+ /* ------ Pre-instruction actions for fixed rreg uses ------ */
+
+ /* Now we have to deal with rregs which are about to be made
+ live by this instruction -- in other words, are entering into
+ one of their live ranges. If any such rreg holds a vreg, we
+ will have to free up the rreg. The simplest solution which
+ is correct is to spill the rreg.
+
+ Note we could do better:
+ * Could move it into some other free rreg, if one is available
+
+ Do this efficiently, by incrementally stepping along an array
+ of rreg HLRs that are known to be sorted by start point
+ (their .live_after field).
+ */
+ while (True) {
+ vassert(rreg_lrs_la_next >= 0);
+ vassert(rreg_lrs_la_next <= rreg_lrs_used);
+ if (rreg_lrs_la_next == rreg_lrs_used)
+ break; /* no more real reg live ranges to consider */
+ if (ii < rreg_lrs_la[rreg_lrs_la_next].live_after)
+ break; /* next live range does not yet start */
+ vassert(ii == rreg_lrs_la[rreg_lrs_la_next].live_after);
+ /* rreg_lrs_la[rreg_lrs_la_next].rreg needs to be freed up.
+ Find the associated rreg_state entry. */
+ /* Note, re ii == rreg_lrs_la[rreg_lrs_la_next].live_after.
+ Real register live ranges are guaranteed to be well-formed
+ in that they start with a write to the register -- Stage 2
+ rejects any code not satisfying this. So the correct
+ question to ask is whether
+ rreg_lrs_la[rreg_lrs_la_next].live_after == ii, that is,
+ whether the reg becomes live after this insn -- rather
+ than before it. */
+# if DEBUG_REGALLOC
+ vex_printf("need to free up rreg: ");
+ (*ppReg)(rreg_lrs_la[rreg_lrs_la_next].rreg);
+ vex_printf("\n\n");
+# endif
+ for (k = 0; k < n_rregs; k++)
+ if (rreg_state[k].rreg == rreg_lrs_la[rreg_lrs_la_next].rreg)
+ break;
+         /* If this fails, we don't have an entry for this rreg,
+            but we should. */
+ vassert(IS_VALID_RREGNO(k));
+ m = hregNumber(rreg_state[k].vreg);
+ if (rreg_state[k].disp == Bound) {
+ /* Yes, there is an associated vreg. Spill it if it's
+ still live. */
+ vassert(IS_VALID_VREGNO(m));
+ vreg_state[m] = INVALID_RREG_NO;
+ if (vreg_lrs[m].dead_before > ii) {
+ vassert(vreg_lrs[m].reg_class != HRcINVALID);
+ if ((!eq_spill_opt) || !rreg_state[k].eq_spill_slot) {
+ HInstr* spill1 = NULL;
+ HInstr* spill2 = NULL;
+ (*genSpill)( &spill1, &spill2, rreg_state[k].rreg,
+ vreg_lrs[m].spill_offset, mode64 );
+ vassert(spill1 || spill2); /* can't both be NULL */
+ if (spill1)
+ EMIT_INSTR(spill1);
+ if (spill2)
+ EMIT_INSTR(spill2);
+ }
+ rreg_state[k].eq_spill_slot = True;
+ }
+ }
+ rreg_state[k].disp = Unavail;
+ rreg_state[k].vreg = INVALID_HREG;
+ rreg_state[k].eq_spill_slot = False;
+
+ /* check for further rregs entering HLRs at this point */
+ rreg_lrs_la_next++;
+ }
+
+
+# if DEBUG_REGALLOC
+ vex_printf("After pre-insn actions for fixed regs:\n");
+ PRINT_STATE;
+ vex_printf("\n");
+# endif
+
+
+ /* ------ Deal with the current instruction. ------ */
+
+ /* Finally we can begin the processing of this instruction
+ itself. The aim is to free up enough rregs for this insn.
+ This may generate spill stores since we may have to evict
+ some vregs currently in rregs. Also generates spill loads.
+ We also build up the final vreg->rreg mapping to be applied
+ to the insn. */
+
+      (*getRegUsage)( &reg_usage, instrs_in->arr[ii], mode64 );
+
+ initHRegRemap(&remap);
+
+ /* ------------ BEGIN directReload optimisation ----------- */
+
+ /* If the instruction reads exactly one vreg which is currently
+ in a spill slot, and this is last use of that vreg, see if we
+      can convert the instruction into one which reads directly from
+      the spill slot.  This is clearly only possible for x86 and amd64
+      targets, since ppc and arm are load-store architectures.  If
+ successful, replace instrs_in->arr[ii] with this new
+ instruction, and recompute its reg usage, so that the change
+ is invisible to the standard-case handling that follows. */
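+      /* Example (illustrative, x86-flavoured pseudo-asm; %GSP stands
+         for the guest state pointer): if "add %v7, %v3" is the last
+         use of v7, and v7 currently lives only in its spill slot at
+         offset 96, the insn can become "add 96(%GSP), %v3", avoiding
+         an explicit reload. */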
+
+ if (directReload && reg_usage.n_used <= 2) {
+ Bool debug_direct_reload = True && False;
+ HReg cand = INVALID_HREG;
+      Int nreads = 0; /* a counter, not a flag, hence Int not Bool */
+ Short spilloff = 0;
+
+ for (j = 0; j < reg_usage.n_used; j++) {
+
+ vreg = reg_usage.hreg[j];
+
+ if (!hregIsVirtual(vreg))
+ continue;
+
+ if (reg_usage.mode[j] == HRmRead) {
+ nreads++;
+ m = hregNumber(vreg);
+ vassert(IS_VALID_VREGNO(m));
+ k = vreg_state[m];
+ if (!IS_VALID_RREGNO(k)) {
+ /* ok, it is spilled. Now, is this its last use? */
+ vassert(vreg_lrs[m].dead_before >= ii+1);
+ if (vreg_lrs[m].dead_before == ii+1
+ && cand == INVALID_HREG) {
+ spilloff = vreg_lrs[m].spill_offset;
+ cand = vreg;
+ }
+ }
+ }
+ }
+
+ if (nreads == 1 && cand != INVALID_HREG) {
+ HInstr* reloaded;
+ if (reg_usage.n_used == 2)
+ vassert(reg_usage.hreg[0] != reg_usage.hreg[1]);
+
+ reloaded = directReload ( instrs_in->arr[ii], cand, spilloff );
+ if (debug_direct_reload && !reloaded) {
+ vex_printf("[%3d] ", spilloff); ppHReg(cand); vex_printf(" ");
+ ppInstr(instrs_in->arr[ii], mode64);
+ }
+ if (reloaded) {
+ /* Update info about the insn, so it looks as if it had
+ been in this form all along. */
+ instrs_in->arr[ii] = reloaded;
+               (*getRegUsage)( &reg_usage, instrs_in->arr[ii], mode64 );
+               if (debug_direct_reload) {
+ vex_printf(" --> ");
+ ppInstr(reloaded, mode64);
+ }
+ }
+
+ if (debug_direct_reload && !reloaded)
+ vex_printf("\n");
+ }
+
+ }
+
+ /* ------------ END directReload optimisation ------------ */
+
+ /* for each reg mentioned in the insn ... */
+ for (j = 0; j < reg_usage.n_used; j++) {
+
+ vreg = reg_usage.hreg[j];
+
+ /* only interested in virtual registers right now. */
+ if (!hregIsVirtual(vreg))
+ continue;
+
+# if 0
+ vex_printf("considering "); (*ppReg)(vreg); vex_printf("\n");
+# endif
+
+ /* Now we're trying to find a rreg for "vreg". First of all,
+ if it already has an rreg assigned, we don't need to do
+ anything more. Search the current state to find out. */
+ m = hregNumber(vreg);
+ vassert(IS_VALID_VREGNO(m));
+ k = vreg_state[m];
+ if (IS_VALID_RREGNO(k)) {
+ vassert(rreg_state[k].disp == Bound);
+ addToHRegRemap(&remap, vreg, rreg_state[k].rreg);
+ /* If this rreg is written or modified, mark it as different
+ from any spill slot value. */
+ if (reg_usage.mode[j] != HRmRead)
+ rreg_state[k].eq_spill_slot = False;
+ continue;
+ } else {
+ vassert(k == INVALID_RREG_NO);
+ }
+
+ /* No luck. The next thing to do is see if there is a
+ currently free rreg available, of the correct class. If
+ so, bag it. NOTE, we could improve this by selecting an
+ rreg for which the next live-range event is as far ahead
+ as possible. */
+ k_suboptimal = -1;
+ for (k = 0; k < n_rregs; k++) {
+ if (rreg_state[k].disp != Free
+ || hregClass(rreg_state[k].rreg) != hregClass(vreg))
+ continue;
+ if (rreg_state[k].has_hlrs) {
+ /* Well, at least we can use k_suboptimal if we really
+ have to. Keep on looking for a better candidate. */
+ k_suboptimal = k;
+ } else {
+ /* Found a preferable reg. Use it. */
+ k_suboptimal = -1;
+ break;
+ }
+ }
+ if (k_suboptimal >= 0)
+ k = k_suboptimal;
+
+ if (k < n_rregs) {
+ rreg_state[k].disp = Bound;
+ rreg_state[k].vreg = vreg;
+ m = hregNumber(vreg);
+ vassert(IS_VALID_VREGNO(m));
+ vreg_state[m] = toShort(k);
+ addToHRegRemap(&remap, vreg, rreg_state[k].rreg);
+ /* Generate a reload if needed. This only creates needed
+ reloads because the live range builder for vregs will
+ guarantee that the first event for a vreg is a write.
+ Hence, if this reference is not a write, it cannot be
+ the first reference for this vreg, and so a reload is
+ indeed needed. */
+ if (reg_usage.mode[j] != HRmWrite) {
+ vassert(vreg_lrs[m].reg_class != HRcINVALID);
+ HInstr* reload1 = NULL;
+ HInstr* reload2 = NULL;
+ (*genReload)( &reload1, &reload2, rreg_state[k].rreg,
+ vreg_lrs[m].spill_offset, mode64 );
+ vassert(reload1 || reload2); /* can't both be NULL */
+ if (reload1)
+ EMIT_INSTR(reload1);
+ if (reload2)
+ EMIT_INSTR(reload2);
+ /* This rreg is read or modified by the instruction.
+ If it's merely read we can claim it now equals the
+ spill slot, but not so if it is modified. */
+ if (reg_usage.mode[j] == HRmRead) {
+ rreg_state[k].eq_spill_slot = True;
+ } else {
+ vassert(reg_usage.mode[j] == HRmModify);
+ rreg_state[k].eq_spill_slot = False;
+ }
+ } else {
+ rreg_state[k].eq_spill_slot = False;
+ }
+
+ continue;
+ }
+
+ /* Well, now we have no option but to spill a vreg. It's
+ important to make a good choice of vreg to spill, and of
+ course we need to be careful not to spill a vreg which is
+ needed by this insn. */
+
+ /* First, mark in the rreg_state those rregs which are not spill
+ candidates, either because they hold a vreg mentioned by this
+ instruction or because they are of the wrong class. */
+ for (k = 0; k < n_rregs; k++) {
+ rreg_state[k].is_spill_cand = False;
+ if (rreg_state[k].disp != Bound)
+ continue;
+ if (hregClass(rreg_state[k].rreg) != hregClass(vreg))
+ continue;
+ rreg_state[k].is_spill_cand = True;
+ for (m = 0; m < reg_usage.n_used; m++) {
+ if (rreg_state[k].vreg == reg_usage.hreg[m]) {
+ rreg_state[k].is_spill_cand = False;
+ break;
+ }
+ }
+ }
+
+ /* We can choose to spill any rreg satisfying
+ rreg_state[r].is_spill_cand (so to speak). Choose r so that
+ the next use of its associated vreg is as far ahead as
+ possible, in the hope that this will minimise the number
+ of consequent reloads required. */
+ spillee
+ = findMostDistantlyMentionedVReg (
+ getRegUsage, instrs_in, ii+1, rreg_state, n_rregs, mode64 );
+
+ if (spillee == -1) {
+ /* Hmmmmm. There don't appear to be any spill candidates.
+ We're hosed. */
+ vex_printf("reg_alloc: can't find a register in class: ");
+ ppHRegClass(hregClass(vreg));
+ vex_printf("\n");
+ vpanic("reg_alloc: can't create a free register.");
+ }
+
+ /* Right. So we're going to spill rreg_state[spillee]. */
+ vassert(IS_VALID_RREGNO(spillee));
+ vassert(rreg_state[spillee].disp == Bound);
+ /* check it's the right class */
+ vassert(hregClass(rreg_state[spillee].rreg) == hregClass(vreg));
+ /* check we're not ejecting the vreg for which we are trying
+ to free up a register. */
+ vassert(rreg_state[spillee].vreg != vreg);
+
+ m = hregNumber(rreg_state[spillee].vreg);
+ vassert(IS_VALID_VREGNO(m));
+
+ /* So here's the spill store. Assert that we're spilling a
+ live vreg. */
+ vassert(vreg_lrs[m].dead_before > ii);
+ vassert(vreg_lrs[m].reg_class != HRcINVALID);
+ if ((!eq_spill_opt) || !rreg_state[spillee].eq_spill_slot) {
+ HInstr* spill1 = NULL;
+ HInstr* spill2 = NULL;
+ (*genSpill)( &spill1, &spill2, rreg_state[spillee].rreg,
+ vreg_lrs[m].spill_offset, mode64 );
+ vassert(spill1 || spill2); /* can't both be NULL */
+ if (spill1)
+ EMIT_INSTR(spill1);
+ if (spill2)
+ EMIT_INSTR(spill2);
+ }
+
+ /* Update the rreg_state to reflect the new assignment for this
+ rreg. */
+ rreg_state[spillee].vreg = vreg;
+ vreg_state[m] = INVALID_RREG_NO;
+
+ rreg_state[spillee].eq_spill_slot = False; /* be safe */
+
+ m = hregNumber(vreg);
+ vassert(IS_VALID_VREGNO(m));
+ vreg_state[m] = toShort(spillee);
+
+ /* Now, if this vreg is being read or modified (as opposed to
+ written), we have to generate a reload for it. */
+ if (reg_usage.mode[j] != HRmWrite) {
+ vassert(vreg_lrs[m].reg_class != HRcINVALID);
+ HInstr* reload1 = NULL;
+ HInstr* reload2 = NULL;
+ (*genReload)( &reload1, &reload2, rreg_state[spillee].rreg,
+ vreg_lrs[m].spill_offset, mode64 );
+ vassert(reload1 || reload2); /* can't both be NULL */
+ if (reload1)
+ EMIT_INSTR(reload1);
+ if (reload2)
+ EMIT_INSTR(reload2);
+ /* This rreg is read or modified by the instruction.
+ If it's merely read we can claim it now equals the
+ spill slot, but not so if it is modified. */
+ if (reg_usage.mode[j] == HRmRead) {
+ rreg_state[spillee].eq_spill_slot = True;
+ } else {
+ vassert(reg_usage.mode[j] == HRmModify);
+ rreg_state[spillee].eq_spill_slot = False;
+ }
+ }
+
+ /* So after much twisting and turning, we have vreg mapped to
+ rreg_state[spillee].rreg. Note that in the map. */
+ addToHRegRemap(&remap, vreg, rreg_state[spillee].rreg);
+
+ } /* iterate over registers in this instruction. */
+
+ /* We've finished clowning around with registers in this instruction.
+ Three results:
+ - the running rreg_state[] has been updated
+ - a suitable vreg->rreg mapping for this instruction has been
+ constructed
+ - spill and reload instructions may have been emitted.
+
+ The final step is to apply the mapping to the instruction,
+ and emit that.
+ */
+
+ /* NOTE, DESTRUCTIVELY MODIFIES instrs_in->arr[ii]. */
+ (*mapRegs)( &remap, instrs_in->arr[ii], mode64 );
+ EMIT_INSTR( instrs_in->arr[ii] );
+
+# if DEBUG_REGALLOC
+ vex_printf("After dealing with current insn:\n");
+ PRINT_STATE;
+ vex_printf("\n");
+# endif
+
+ /* ------ Post-instruction actions for fixed rreg uses ------ */
+
+ /* Now we need to check for rregs exiting fixed live ranges
+ after this instruction, and if so mark them as free. */
+ while (True) {
+ vassert(rreg_lrs_db_next >= 0);
+ vassert(rreg_lrs_db_next <= rreg_lrs_used);
+ if (rreg_lrs_db_next == rreg_lrs_used)
+ break; /* no more real reg live ranges to consider */
+ if (ii+1 < rreg_lrs_db[rreg_lrs_db_next].dead_before)
+ break; /* next live range does not yet start */
+ vassert(ii+1 == rreg_lrs_db[rreg_lrs_db_next].dead_before);
+ /* rreg_lrs_db[rreg_lrs_db_next].rreg is exiting a hard live
+ range. Mark it as such in the main rreg_state array. */
+ for (k = 0; k < n_rregs; k++)
+ if (rreg_state[k].rreg == rreg_lrs_db[rreg_lrs_db_next].rreg)
+ break;
+ /* If this vassertion fails, we don't have an entry for
+ this rreg. Which we should. */
+ vassert(k < n_rregs);
+ vassert(rreg_state[k].disp == Unavail);
+ rreg_state[k].disp = Free;
+ rreg_state[k].vreg = INVALID_HREG;
+ rreg_state[k].eq_spill_slot = False;
+
+ /* check for further rregs leaving HLRs at this point */
+ rreg_lrs_db_next++;
+ }
+
+# if DEBUG_REGALLOC
+ vex_printf("After post-insn actions for fixed regs:\n");
+ PRINT_STATE;
+ vex_printf("\n");
+# endif
+
+ } /* iterate over insns */
+
+ /* ------ END: Process each insn in turn. ------ */
+
+ /* free(rreg_state); */
+ /* free(rreg_lrs); */
+ /* if (vreg_lrs) free(vreg_lrs); */
+
+ /* Paranoia */
+ for (j = 0; j < n_rregs; j++)
+ vassert(rreg_state[j].rreg == available_real_regs[j]);
+
+ vassert(rreg_lrs_la_next == rreg_lrs_used);
+ vassert(rreg_lrs_db_next == rreg_lrs_used);
+
+ return instrs_out;
+
+# undef INVALID_INSTRNO
+# undef EMIT_INSTR
+# undef PRINT_STATE
+}
+
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_reg_alloc2.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_regs.c b/VEX/priv/host_generic_regs.c
new file mode 100644
index 0000000..e36b4dc
--- /dev/null
+++ b/VEX/priv/host_generic_regs.c
@@ -0,0 +1,223 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_regs.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex.h"
+
+#include "main_util.h"
+#include "host_generic_regs.h"
+
+
+void ppHRegClass ( HRegClass hrc )
+{
+ switch (hrc) {
+ case HRcInt32: vex_printf("HRcInt32"); break;
+ case HRcInt64: vex_printf("HRcInt64"); break;
+ case HRcFlt32: vex_printf("HRcFlt32"); break;
+ case HRcFlt64: vex_printf("HRcFlt64"); break;
+ case HRcVec64: vex_printf("HRcVec64"); break;
+ case HRcVec128: vex_printf("HRcVec128"); break;
+ default: vpanic("ppHRegClass");
+ }
+}
+
+/* Generic printing for registers. */
+void ppHReg ( HReg r )
+{
+ HChar* maybe_v = hregIsVirtual(r) ? "v" : "";
+ Int regNo = hregNumber(r);
+ switch (hregClass(r)) {
+ case HRcInt32: vex_printf("%%%sr%d", maybe_v, regNo); return;
+ case HRcInt64: vex_printf("%%%sR%d", maybe_v, regNo); return;
+ case HRcFlt32: vex_printf("%%%sF%d", maybe_v, regNo); return;
+ case HRcFlt64: vex_printf("%%%sD%d", maybe_v, regNo); return;
+ case HRcVec64: vex_printf("%%%sv%d", maybe_v, regNo); return;
+ case HRcVec128: vex_printf("%%%sV%d", maybe_v, regNo); return;
+ default: vpanic("ppHReg");
+ }
+}
+
+
+/*---------------------------------------------------------*/
+/*--- Helpers for recording reg usage (for reg-alloc) ---*/
+/*---------------------------------------------------------*/
+
+void ppHRegUsage ( HRegUsage* tab )
+{
+ Int i;
+ HChar* str;
+ vex_printf("HRegUsage {\n");
+ for (i = 0; i < tab->n_used; i++) {
+ switch (tab->mode[i]) {
+ case HRmRead: str = "Read "; break;
+ case HRmWrite: str = "Write "; break;
+ case HRmModify: str = "Modify "; break;
+ default: vpanic("ppHRegUsage");
+ }
+ vex_printf(" %s ", str);
+ ppHReg(tab->hreg[i]);
+ vex_printf("\n");
+ }
+ vex_printf("}\n");
+}
+
+
+/* Add a register to a usage table. Combine incoming read uses with
+ existing write uses into a modify use, and vice versa. Do not
+ create duplicate entries -- each reg should only be mentioned once.
+*/
+void addHRegUse ( HRegUsage* tab, HRegMode mode, HReg reg )
+{
+ Int i;
+ /* Find it ... */
+ for (i = 0; i < tab->n_used; i++)
+ if (tab->hreg[i] == reg)
+ break;
+ if (i == tab->n_used) {
+ /* Not found, add new entry. */
+ vassert(tab->n_used < N_HREG_USAGE);
+ tab->hreg[tab->n_used] = reg;
+ tab->mode[tab->n_used] = mode;
+ tab->n_used++;
+ } else {
+ /* Found: combine or ignore. */
+ /* This is a greatest-lower-bound operation in the poset:
+
+ R W
+ \ /
+ M
+
+ Need to do: tab->mode[i] = GLB(tab->mode[i], mode). In this
+ case very simple -- if tab->mode[i] != mode then result must
+ be M.
+ */
+ if (tab->mode[i] == mode) {
+ /* duplicate, ignore */
+ } else {
+ tab->mode[i] = HRmModify;
+ }
+ }
+}
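+
+/* Example of the combining rule above (illustrative): the two calls
+
+ addHRegUse(&tab, HRmRead, r);
+ addHRegUse(&tab, HRmWrite, r);
+
+ leave a single entry for r with mode HRmModify, since
+ GLB(Read, Write) = Modify in the poset shown above. */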
+
+
+/*---------------------------------------------------------*/
+/*--- Indicating register remappings (for reg-alloc) ---*/
+/*---------------------------------------------------------*/
+
+void ppHRegRemap ( HRegRemap* map )
+{
+ Int i;
+ vex_printf("HRegRemap {\n");
+ for (i = 0; i < map->n_used; i++) {
+ vex_printf(" ");
+ ppHReg(map->orig[i]);
+ vex_printf(" --> ");
+ ppHReg(map->replacement[i]);
+ vex_printf("\n");
+ }
+ vex_printf("}\n");
+}
+
+
+void initHRegRemap ( HRegRemap* map )
+{
+ map->n_used = 0;
+}
+
+
+void addToHRegRemap ( HRegRemap* map, HReg orig, HReg replacement )
+{
+ Int i;
+ for (i = 0; i < map->n_used; i++)
+ if (map->orig[i] == orig)
+ vpanic("addToHRegMap: duplicate entry");
+ if (!hregIsVirtual(orig))
+ vpanic("addToHRegMap: orig is not a vreg");
+ if (hregIsVirtual(replacement))
+ vpanic("addToHRegMap: replacement is not a vreg");
+
+ vassert(map->n_used+1 < N_HREG_REMAP);
+ map->orig[map->n_used] = orig;
+ map->replacement[map->n_used] = replacement;
+ map->n_used++;
+}
+
+
+HReg lookupHRegRemap ( HRegRemap* map, HReg orig )
+{
+ Int i;
+ if (!hregIsVirtual(orig))
+ return orig;
+ for (i = 0; i < map->n_used; i++)
+ if (map->orig[i] == orig)
+ return map->replacement[i];
+ vpanic("lookupHRegRemap: not found");
+}
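+
+/* Usage sketch (register names hypothetical): after
+
+ initHRegRemap(&map);
+ addToHRegRemap(&map, v7, rEAX);
+
+ lookupHRegRemap(&map, v7) returns rEAX, while any real reg passed
+ in is returned unchanged, which is exactly what a mapRegs callback
+ needs when applying the allocator's decisions. */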
+
+/*---------------------------------------------------------*/
+/*--- Abstract instructions ---*/
+/*---------------------------------------------------------*/
+
+HInstrArray* newHInstrArray ( void )
+{
+ HInstrArray* ha = LibVEX_Alloc(sizeof(HInstrArray));
+ ha->arr_size = 4;
+ ha->arr_used = 0;
+ ha->arr = LibVEX_Alloc(ha->arr_size * sizeof(HInstr*));
+ ha->n_vregs = 0;
+ return ha;
+}
+
+void addHInstr ( HInstrArray* ha, HInstr* instr )
+{
+ vassert(ha->arr_used <= ha->arr_size);
+ if (ha->arr_used < ha->arr_size) {
+ ha->arr[ha->arr_used] = instr;
+ ha->arr_used++;
+ } else {
+ Int i;
+ HInstr** arr2 = LibVEX_Alloc(ha->arr_size * 2 * sizeof(HInstr*));
+ for (i = 0; i < ha->arr_size; i++)
+ arr2[i] = ha->arr[i];
+ ha->arr_size *= 2;
+ ha->arr = arr2;
+ addHInstr(ha, instr);
+ }
+}
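+
+/* Usage note: newHInstrArray starts with room for 4 insns and
+ addHInstr doubles the array on demand, so appending N insns costs
+ amortised O(1) each. */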
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_regs.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_regs.h b/VEX/priv/host_generic_regs.h
new file mode 100644
index 0000000..1c6826c
--- /dev/null
+++ b/VEX/priv/host_generic_regs.h
@@ -0,0 +1,281 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_regs.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#ifndef __VEX_HOST_GENERIC_REGS_H
+#define __VEX_HOST_GENERIC_REGS_H
+
+#include "libvex_basictypes.h"
+
+
+/*---------------------------------------------------------*/
+/*--- Representing HOST REGISTERS ---*/
+/*---------------------------------------------------------*/
+
+/* Host registers. Stuff to represent:
+
+ - The register number
+ - The register class
+ - Whether or not the register is a virtual reg.
+
+ Registers are a 32-bit UInt, thusly:
+
+ bits 31-28 are the register class.
+ bits 27-24 are 0000b for a real register, 0001b for a virtual register
+ bits 23-0 are the register number
+
+ Note (importantly) that by arranging that the class field is never
+ 0000b, any valid register looks like an extremely large int -- at
+ least 2^28 -- and so there is little chance of confusing it with an
+ integer array index in the register allocator.
+
+ Note further that since the class field is never 1111b, no valid
+ register can have the value INVALID_HREG.
+
+ There are currently 6 register classes:
+
+ int32 int64 float32 float64 simd64 simd128
+*/
+
+typedef UInt HReg;
+
+/* When extending this, do not use any value > 14 or < 0. */
+/* HRegClass describes host register classes which the instruction
+ selectors can speak about. We would not expect all of them to be
+ available on any specific host. For example on x86, the available
+ classes are: Int32, Flt64, Vec128 only.
+
+ IMPORTANT NOTE: host_generic_reg_alloc2.c needs to know how much
+ space is needed to spill each class of register. It allocates the
+ following amount of space:
+
+ HRcInt32 64 bits
+ HRcInt64 64 bits
+ HRcFlt32 64 bits
+ HRcFlt64 128 bits (on x86 these are spilled by fstpt/fldt and
+ so won't fit in a 64-bit slot)
+ HRcVec64 64 bits
+ HRcVec128 128 bits
+
+ If you add another regclass, you must remember to update
+ host_generic_reg_alloc2.c accordingly.
+*/
+typedef
+ enum {
+ HRcINVALID=1, /* NOT A VALID REGISTER CLASS */
+ HRcInt32=3, /* 32-bit int */
+ HRcInt64=4, /* 64-bit int */
+ HRcFlt32=5, /* 32-bit float */
+ HRcFlt64=6, /* 64-bit float */
+ HRcVec64=7, /* 64-bit SIMD */
+ HRcVec128=8 /* 128-bit SIMD */
+ }
+ HRegClass;
+
+extern void ppHRegClass ( HRegClass );
+
+
+/* Print an HReg in a generic (non-target-specific) way. */
+extern void ppHReg ( HReg );
+
+/* Construct/destruct. */
+static inline HReg mkHReg ( UInt regno, HRegClass rc, Bool virtual ) {
+ UInt r24 = regno & 0x00FFFFFF;
+ /* This is critical. The register number field may only
+ occupy 24 bits. */
+ if (r24 != regno)
+ vpanic("mkHReg: regno exceeds 2^24");
+ return regno | (((UInt)rc) << 28) | (virtual ? (1<<24) : 0);
+}
+
+static inline HRegClass hregClass ( HReg r ) {
+ UInt rc = r;
+ rc = (rc >> 28) & 0x0F;
+ vassert(rc >= HRcInt32 && rc <= HRcVec128);
+ return (HRegClass)rc;
+}
+
+static inline UInt hregNumber ( HReg r ) {
+ return ((UInt)r) & 0x00FFFFFF;
+}
+
+static inline Bool hregIsVirtual ( HReg r ) {
+ return toBool(((UInt)r) & (1<<24));
+}
+
+
+
+
+#define INVALID_HREG ((HReg)0xFFFFFFFF)
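+
+/* Worked example of the encoding (the numbers follow directly from
+ mkHReg above): mkHReg(3, HRcInt64, True) gives
+ (4 << 28) | (1 << 24) | 3 = 0x41000003,
+ for which hregClass is HRcInt64, hregNumber is 3 and hregIsVirtual
+ is True. A real register such as mkHReg(5, HRcVec128, False)
+ encodes as 0x80000005. */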
+
+
+/*---------------------------------------------------------*/
+/*--- Recording register usage (for reg-alloc) ---*/
+/*---------------------------------------------------------*/
+
+typedef
+ enum { HRmRead, HRmWrite, HRmModify }
+ HRegMode;
+
+
+/* A struct for recording the usage of registers in instructions.
+ This can get quite large, but we don't expect to allocate them
+ dynamically, so there's no problem.
+*/
+#define N_HREG_USAGE 25
+
+typedef
+ struct {
+ HReg hreg[N_HREG_USAGE];
+ HRegMode mode[N_HREG_USAGE];
+ Int n_used;
+ }
+ HRegUsage;
+
+extern void ppHRegUsage ( HRegUsage* );
+
+static inline void initHRegUsage ( HRegUsage* tab ) {
+ tab->n_used = 0;
+}
+
+/* Add a register to a usage table. Combine incoming read uses with
+ existing write uses into a modify use, and vice versa. Do not
+ create duplicate entries -- each reg should only be mentioned once.
+*/
+extern void addHRegUse ( HRegUsage*, HRegMode, HReg );
+
+
+
+/*---------------------------------------------------------*/
+/*--- Indicating register remappings (for reg-alloc) ---*/
+/*---------------------------------------------------------*/
+
+/* Note that such maps can only map virtual regs to real regs.
+ addToHRegRemap will barf if given a pair not of that form. As a
+ result, no valid HRegRemap will bind a real reg to anything, and so
+ if lookupHRegRemap is given a real reg, it returns it unchanged.
+ This is precisely the behaviour that the register allocator needs
+ to impose its decisions on the instructions it processes. */
+
+#define N_HREG_REMAP 5
+
+typedef
+ struct {
+ HReg orig [N_HREG_REMAP];
+ HReg replacement[N_HREG_REMAP];
+ Int n_used;
+ }
+ HRegRemap;
+
+extern void ppHRegRemap ( HRegRemap* );
+extern void initHRegRemap ( HRegRemap* );
+extern void addToHRegRemap ( HRegRemap*, HReg, HReg );
+extern HReg lookupHRegRemap ( HRegRemap*, HReg );
+
+
+/*---------------------------------------------------------*/
+/*--- Abstract instructions ---*/
+/*---------------------------------------------------------*/
+
+/* A type is needed to refer to pointers to instructions of any
+ target. Defining it like this means that HInstr* can stand in for
+ X86Instr*, ArmInstr*, etc. */
+
+typedef void HInstr;
+
+
+/* An expandable array of HInstr*'s. Handy for insn selection and
+ register allocation. n_vregs indicates the number of virtual
+ registers mentioned in the code, something that reg-alloc needs to
+ know. These are required to be numbered 0 .. n_vregs-1.
+*/
+typedef
+ struct {
+ HInstr** arr;
+ Int arr_size;
+ Int arr_used;
+ Int n_vregs;
+ }
+ HInstrArray;
+
+extern HInstrArray* newHInstrArray ( void );
+extern void addHInstr ( HInstrArray*, HInstr* );
+
+
+/*---------------------------------------------------------*/
+/*--- Reg alloc: TODO: move somewhere else ---*/
+/*---------------------------------------------------------*/
+
+extern
+HInstrArray* doRegisterAllocation (
+
+ /* Incoming virtual-registerised code. */
+ HInstrArray* instrs_in,
+
+ /* An array listing all the real registers the allocator may use,
+ in no particular order. */
+ HReg* available_real_regs,
+ Int n_available_real_regs,
+
+ /* Return True iff the given insn is a reg-reg move, in which
+ case also return the src and dst regs. */
+ Bool (*isMove) (HInstr*, HReg*, HReg*),
+
+ /* Get info about register usage in this insn. */
+ void (*getRegUsage) (HRegUsage*, HInstr*, Bool),
+
+ /* Apply a reg-reg mapping to an insn. */
+ void (*mapRegs) (HRegRemap*, HInstr*, Bool),
+
+ /* Return insn(s) to spill/restore a real reg to a spill slot
+ offset. And optionally a function to do direct reloads. */
+ void (*genSpill) ( HInstr**, HInstr**, HReg, Int, Bool ),
+ void (*genReload) ( HInstr**, HInstr**, HReg, Int, Bool ),
+ HInstr* (*directReload) ( HInstr*, HReg, Short ),
+ Int guest_sizeB,
+
+ /* For debug printing only. */
+ void (*ppInstr) ( HInstr*, Bool ),
+ void (*ppReg) ( HReg ),
+
+ /* 32/64bit mode */
+ Bool mode64
+);
+
+
+#endif /* ndef __VEX_HOST_GENERIC_REGS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_regs.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_simd128.c b/VEX/priv/host_generic_simd128.c
new file mode 100644
index 0000000..8ed5166
--- /dev/null
+++ b/VEX/priv/host_generic_simd128.c
@@ -0,0 +1,220 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd128.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2010-2010 OpenWorks GbR
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ from IR. */
+
+#include "libvex_basictypes.h"
+#include "host_generic_simd128.h"
+
+
+/* Primitive helpers always take args of the real type (signed vs
+ unsigned) but return an unsigned result, so there's no conversion
+ weirdness when stuffing results back in the V128 union fields,
+ which are all unsigned. */
+
+static inline UInt mul32 ( Int xx, Int yy )
+{
+ Int t = ((Int)xx) * ((Int)yy);
+ return toUInt(t);
+}
+
+static inline UInt max32S ( Int xx, Int yy )
+{
+ return toUInt((xx > yy) ? xx : yy);
+}
+
+static inline UInt min32S ( Int xx, Int yy )
+{
+ return toUInt((xx < yy) ? xx : yy);
+}
+
+static inline UInt max32U ( UInt xx, UInt yy )
+{
+ return toUInt((xx > yy) ? xx : yy);
+}
+
+static inline UInt min32U ( UInt xx, UInt yy )
+{
+ return toUInt((xx < yy) ? xx : yy);
+}
+
+static inline UShort max16U ( UShort xx, UShort yy )
+{
+ return toUShort((xx > yy) ? xx : yy);
+}
+
+static inline UShort min16U ( UShort xx, UShort yy )
+{
+ return toUShort((xx < yy) ? xx : yy);
+}
+
+static inline UChar max8S ( Char xx, Char yy )
+{
+ return toUChar((xx > yy) ? xx : yy);
+}
+
+static inline UChar min8S ( Char xx, Char yy )
+{
+ return toUChar((xx < yy) ? xx : yy);
+}
+
+static inline ULong cmpGT64S ( Long xx, Long yy )
+{
+ return (((Long)xx) > ((Long)yy))
+ ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
+}
+
+void h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
+ res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
+ res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
+ res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
+ res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
+ res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
+ res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
+ res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
+ res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
+ res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
+ res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
+ res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
+ res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
+ res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
+ res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
+ res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
+}
+
+void h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
+ res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
+ res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
+ res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
+ res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
+ res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
+ res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
+ res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
+}
+
+void h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
+ res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
+ res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
+ res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
+ res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
+ res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
+ res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
+ res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
+}
+
+void h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
+ res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
+ res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
+ res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
+ res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
+ res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
+ res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
+ res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
+ res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
+ res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
+ res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
+ res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
+ res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
+ res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
+ res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
+ res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
+}
+
+void h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
+ res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
+ res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
+ res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
+ res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
+ res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
+ res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
+ res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
+ res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
+ res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
+ res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
+ res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
+ res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
+ res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
+ res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
+ res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
+}
+
+void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
+ V128* argL, V128* argR )
+{
+ res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
+ res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd128.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_simd128.h b/VEX/priv/host_generic_simd128.h
new file mode 100644
index 0000000..53850cb
--- /dev/null
+++ b/VEX/priv/host_generic_simd128.h
@@ -0,0 +1,67 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2010-2010 OpenWorks GbR
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+*/
+
+/* Generic helper functions for doing 128-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ as clean helper functions from IR.
+
+ These will get called from generated code and therefore should be
+ well behaved -- no floating point or mmx insns, just straight
+ integer code.
+
+ Each function implements the correspondingly-named IR primop.
+*/
+
+#ifndef __VEX_HOST_GENERIC_SIMD128_H
+#define __VEX_HOST_GENERIC_SIMD128_H
+
+#include "libvex_basictypes.h"
+
+/* DO NOT MAKE THESE INTO REGPARM FNS! THIS WILL BREAK CALLING
+ SEQUENCES GENERATED BY host-x86/isel.c. */
+
+extern void h_generic_calc_Mul32x4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Sx4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Sx4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max32Ux4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min32Ux4 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max16Ux8 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min16Ux8 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Max8Sx16 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_Min8Sx16 ( /*OUT*/V128*, V128*, V128* );
+extern void h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128*, V128*, V128* );
+
+
+#endif /* ndef __VEX_HOST_GENERIC_SIMD128_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd128.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_simd64.c b/VEX/priv/host_generic_simd64.c
new file mode 100644
index 0000000..03d6d2f
--- /dev/null
+++ b/VEX/priv/host_generic_simd64.c
@@ -0,0 +1,1337 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd64.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ from IR. */
+
+#include "libvex_basictypes.h"
+#include "host_generic_simd64.h"
+
+
+
+/* Tuple/select functions for 32x2 vectors. */
+
+static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
+ return (((ULong)w1) << 32) | ((ULong)w0);
+}
+
+static inline UInt sel32x2_1 ( ULong w64 ) {
+ return 0xFFFFFFFF & toUInt(w64 >> 32);
+}
+static inline UInt sel32x2_0 ( ULong w64 ) {
+ return 0xFFFFFFFF & toUInt(w64);
+}
+
+
+/* Tuple/select functions for 16x4 vectors. gcc is pretty hopeless
+ with 64-bit shifts so we give it a hand. */
+
+static inline ULong mk16x4 ( UShort w3, UShort w2,
+ UShort w1, UShort w0 ) {
+ UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
+ UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
+ return mk32x2(hi32, lo32);
+}
+
+static inline UShort sel16x4_3 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUShort(0xFFFF & (hi32 >> 16));
+}
+static inline UShort sel16x4_2 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUShort(0xFFFF & hi32);
+}
+static inline UShort sel16x4_1 ( ULong w64 ) {
+ UInt lo32 = (UInt)w64;
+ return toUShort(0xFFFF & (lo32 >> 16));
+}
+static inline UShort sel16x4_0 ( ULong w64 ) {
+ UInt lo32 = (UInt)w64;
+ return toUShort(0xFFFF & lo32);
+}
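+
+/* Lane ordering example: mk16x4(0x0123, 0x4567, 0x89AB, 0xCDEF)
+ yields 0x0123456789ABCDEF, so lane 3 occupies the most significant
+ 16 bits, and sel16x4_2 of that value is 0x4567. */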
+
+
+/* Tuple/select functions for 8x8 vectors. */
+
+static inline ULong mk8x8 ( UChar w7, UChar w6,
+ UChar w5, UChar w4,
+ UChar w3, UChar w2,
+ UChar w1, UChar w0 ) {
+ UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
+ | (((UInt)w5) << 8) | (((UInt)w4) << 0);
+ UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
+ | (((UInt)w1) << 8) | (((UInt)w0) << 0);
+ return mk32x2(hi32, lo32);
+}
+
+static inline UChar sel8x8_7 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(0xFF & (hi32 >> 24));
+}
+static inline UChar sel8x8_6 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(0xFF & (hi32 >> 16));
+}
+static inline UChar sel8x8_5 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(0xFF & (hi32 >> 8));
+}
+static inline UChar sel8x8_4 ( ULong w64 ) {
+ UInt hi32 = toUInt(w64 >> 32);
+ return toUChar(0xFF & (hi32 >> 0));
+}
+static inline UChar sel8x8_3 ( ULong w64 ) {
+ UInt lo32 = (UInt)w64;
+ return toUChar(0xFF & (lo32 >> 24));
+}
+static inline UChar sel8x8_2 ( ULong w64 ) {
+ UInt lo32 = (UInt)w64;
+ return toUChar(0xFF & (lo32 >> 16));
+}
+static inline UChar sel8x8_1 ( ULong w64 ) {
+ UInt lo32 = (UInt)w64;
+ return toUChar(0xFF & (lo32 >> 8));
+}
+static inline UChar sel8x8_0 ( ULong w64 ) {
+ UInt lo32 = (UInt)w64;
+ return toUChar(0xFF & (lo32 >> 0));
+}
+
+static inline UChar index8x8 ( ULong w64, UChar ix ) {
+ ix &= 7;
+ return toUChar((w64 >> (8*ix)) & 0xFF);
+}
+
+
+/* Scalar helpers. */
+
+static inline Short qadd16S ( Short xx, Short yy )
+{
+ Int t = ((Int)xx) + ((Int)yy);
+ if (t < -32768) t = -32768;
+ if (t > 32767) t = 32767;
+ return (Short)t;
+}
+
+static inline Char qadd8S ( Char xx, Char yy )
+{
+ Int t = ((Int)xx) + ((Int)yy);
+ if (t < -128) t = -128;
+ if (t > 127) t = 127;
+ return (Char)t;
+}
+
+static inline UShort qadd16U ( UShort xx, UShort yy )
+{
+ UInt t = ((UInt)xx) + ((UInt)yy);
+ if (t > 0xFFFF) t = 0xFFFF;
+ return (UShort)t;
+}
+
+static inline UChar qadd8U ( UChar xx, UChar yy )
+{
+ UInt t = ((UInt)xx) + ((UInt)yy);
+ if (t > 0xFF) t = 0xFF;
+ return (UChar)t;
+}
+
+static inline Short qsub16S ( Short xx, Short yy )
+{
+ Int t = ((Int)xx) - ((Int)yy);
+ if (t < -32768) t = -32768;
+ if (t > 32767) t = 32767;
+ return (Short)t;
+}
+
+static inline Char qsub8S ( Char xx, Char yy )
+{
+ Int t = ((Int)xx) - ((Int)yy);
+ if (t < -128) t = -128;
+ if (t > 127) t = 127;
+ return (Char)t;
+}
+
+static inline UShort qsub16U ( UShort xx, UShort yy )
+{
+ Int t = ((Int)xx) - ((Int)yy);
+ if (t < 0) t = 0;
+ if (t > 0xFFFF) t = 0xFFFF;
+ return (UShort)t;
+}
+
+static inline UChar qsub8U ( UChar xx, UChar yy )
+{
+ Int t = ((Int)xx) - ((Int)yy);
+ if (t < 0) t = 0;
+ if (t > 0xFF) t = 0xFF;
+ return (UChar)t;
+}
+
+static inline Short mul16 ( Short xx, Short yy )
+{
+ Int t = ((Int)xx) * ((Int)yy);
+ return (Short)t;
+}
+
+static inline Int mul32 ( Int xx, Int yy )
+{
+ Int t = ((Int)xx) * ((Int)yy);
+ return (Int)t;
+}
+
+static inline Short mulhi16S ( Short xx, Short yy )
+{
+ Int t = ((Int)xx) * ((Int)yy);
+ t >>=/*s*/ 16;
+ return (Short)t;
+}
+
+static inline UShort mulhi16U ( UShort xx, UShort yy )
+{
+ UInt t = ((UInt)xx) * ((UInt)yy);
+ t >>=/*u*/ 16;
+ return (UShort)t;
+}
+
+static inline UInt cmpeq32 ( UInt xx, UInt yy )
+{
+ return xx==yy ? 0xFFFFFFFF : 0;
+}
+
+static inline UShort cmpeq16 ( UShort xx, UShort yy )
+{
+ return toUShort(xx==yy ? 0xFFFF : 0);
+}
+
+static inline UChar cmpeq8 ( UChar xx, UChar yy )
+{
+ return toUChar(xx==yy ? 0xFF : 0);
+}
+
+static inline UInt cmpgt32S ( Int xx, Int yy )
+{
+ return xx>yy ? 0xFFFFFFFF : 0;
+}
+
+static inline UShort cmpgt16S ( Short xx, Short yy )
+{
+ return toUShort(xx>yy ? 0xFFFF : 0);
+}
+
+static inline UChar cmpgt8S ( Char xx, Char yy )
+{
+ return toUChar(xx>yy ? 0xFF : 0);
+}
+
+static inline UInt cmpnez32 ( UInt xx )
+{
+ return xx==0 ? 0 : 0xFFFFFFFF;
+}
+
+static inline UShort cmpnez16 ( UShort xx )
+{
+ return toUShort(xx==0 ? 0 : 0xFFFF);
+}
+
+static inline UChar cmpnez8 ( UChar xx )
+{
+ return toUChar(xx==0 ? 0 : 0xFF);
+}
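+
+/* Note that the comparison helpers return per-lane masks rather
+ than booleans: all-ones for true, all-zeroes for false, e.g.
+ cmpgt16S(5, 3) = 0xFFFF. This matches the MMX/SSE compare
+ instruction convention. */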
+
+static inline Short qnarrow32Sto16 ( UInt xx0 )
+{
+ Int xx = (Int)xx0;
+ if (xx < -32768) xx = -32768;
+ if (xx > 32767) xx = 32767;
+ return (Short)xx;
+}
+
+static inline Char qnarrow16Sto8 ( UShort xx0 )
+{
+ Short xx = (Short)xx0;
+ if (xx < -128) xx = -128;
+ if (xx > 127) xx = 127;
+ return (Char)xx;
+}
+
+static inline UChar qnarrow16Uto8 ( UShort xx0 )
+{
+ Short xx = (Short)xx0;
+ if (xx < 0) xx = 0;
+ if (xx > 255) xx = 255;
+ return (UChar)xx;
+}
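+
+/* Saturation examples: qnarrow32Sto16(100000) clamps to 32767 and
+ qnarrow32Sto16(-100000) to -32768; qnarrow16Uto8 clamps negative
+ inputs to 0 and anything above 255 to 255. */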
+
+/* shifts: we don't care about out-of-range ones, since
+ that is dealt with at a higher level. */
+
+static inline UChar shl8 ( UChar v, UInt n )
+{
+ return toUChar(v << n);
+}
+
+static inline UChar sar8 ( UChar v, UInt n )
+{
+ return toUChar(((Char)v) >> n);
+}
+
+static inline UShort shl16 ( UShort v, UInt n )
+{
+ return toUShort(v << n);
+}
+
+static inline UShort shr16 ( UShort v, UInt n )
+{
+ return toUShort((((UShort)v) >> n));
+}
+
+static inline UShort sar16 ( UShort v, UInt n )
+{
+ return toUShort(((Short)v) >> n);
+}
+
+static inline UInt shl32 ( UInt v, UInt n )
+{
+ return v << n;
+}
+
+static inline UInt shr32 ( UInt v, UInt n )
+{
+ return (((UInt)v) >> n);
+}
+
+static inline UInt sar32 ( UInt v, UInt n )
+{
+ return ((Int)v) >> n;
+}
+
+static inline UChar avg8U ( UChar xx, UChar yy )
+{
+ UInt xxi = (UInt)xx;
+ UInt yyi = (UInt)yy;
+ UInt r = (xxi + yyi + 1) >> 1;
+ return (UChar)r;
+}
+
+static inline UShort avg16U ( UShort xx, UShort yy )
+{
+ UInt xxi = (UInt)xx;
+ UInt yyi = (UInt)yy;
+ UInt r = (xxi + yyi + 1) >> 1;
+ return (UShort)r;
+}
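+
+/* The "+ 1" makes these round-to-nearest-up averages, matching the
+ x86 PAVGB/PAVGW rounding rule: avg8U(1, 2) = (1 + 2 + 1) >> 1 = 2. */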
+
+static inline Short max16S ( Short xx, Short yy )
+{
+ return toUShort((xx > yy) ? xx : yy);
+}
+
+static inline UChar max8U ( UChar xx, UChar yy )
+{
+ return toUChar((xx > yy) ? xx : yy);
+}
+
+static inline Short min16S ( Short xx, Short yy )
+{
+ return toUShort((xx < yy) ? xx : yy);
+}
+
+static inline UChar min8U ( UChar xx, UChar yy )
+{
+ return toUChar((xx < yy) ? xx : yy);
+}
+
+static inline UShort hadd16U ( UShort xx, UShort yy )
+{
+ UInt xxi = (UInt)xx;
+ UInt yyi = (UInt)yy;
+ UInt r = (xxi + yyi) >> 1;
+ return (UShort)r;
+}
+
+static inline Short hadd16S ( Short xx, Short yy )
+{
+ Int xxi = (Int)xx;
+ Int yyi = (Int)yy;
+ Int r = (xxi + yyi) >> 1;
+ return (Short)r;
+}
+
+static inline UShort hsub16U ( UShort xx, UShort yy )
+{
+ UInt xxi = (UInt)xx;
+ UInt yyi = (UInt)yy;
+ UInt r = (xxi - yyi) >> 1;
+ return (UShort)r;
+}
+
+static inline Short hsub16S ( Short xx, Short yy )
+{
+ Int xxi = (Int)xx;
+ Int yyi = (Int)yy;
+ Int r = (xxi - yyi) >> 1;
+ return (Short)r;
+}
+
+static inline UChar hadd8U ( UChar xx, UChar yy )
+{
+ UInt xxi = (UInt)xx;
+ UInt yyi = (UInt)yy;
+ UInt r = (xxi + yyi) >> 1;
+ return (UChar)r;
+}
+
+static inline Char hadd8S ( Char xx, Char yy )
+{
+ Int xxi = (Int)xx;
+ Int yyi = (Int)yy;
+ Int r = (xxi + yyi) >> 1;
+ return (Char)r;
+}
+
+static inline UChar hsub8U ( UChar xx, UChar yy )
+{
+ UInt xxi = (UInt)xx;
+ UInt yyi = (UInt)yy;
+ UInt r = (xxi - yyi) >> 1;
+ return (UChar)r;
+}
+
+static inline Char hsub8S ( Char xx, Char yy )
+{
+ Int xxi = (Int)xx;
+ Int yyi = (Int)yy;
+ Int r = (xxi - yyi) >> 1;
+ return (Char)r;
+}
+
+static inline UInt absdiff8U ( UChar xx, UChar yy )
+{
+ UInt xxu = (UChar)xx;
+ UInt yyu = (UChar)yy;
+ return xxu >= yyu ? xxu - yyu : yyu - xxu;
+}
+
+/* ----------------------------------------------------- */
+/* Start of the externally visible functions. These simply
+ implement the corresponding IR primops. */
+/* ----------------------------------------------------- */
+
+/* ------------ Normal addition ------------ */
+
+ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
+{
+ return mk32x2(
+ sel32x2_1(xx) + sel32x2_1(yy),
+ sel32x2_0(xx) + sel32x2_0(yy)
+ );
+}
+
+ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
+ toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
+ toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
+ toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
+ toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
+ toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
+ toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
+ toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
+ toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
+ toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
+ toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
+ );
+}
+
+/* ------------ Saturating addition ------------ */
+
+ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
+ qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
+ qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
+ qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
+ qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
+ qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
+ qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
+ qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
+ qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
+ qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
+ qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
+ );
+}
+
+ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
+ qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
+ qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
+ qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
+ qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
+ qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
+ qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
+ qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
+ qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
+ qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
+ qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
+ );
+}
+
+/* ------------ Normal subtraction ------------ */
+
+ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
+{
+ return mk32x2(
+ sel32x2_1(xx) - sel32x2_1(yy),
+ sel32x2_0(xx) - sel32x2_0(yy)
+ );
+}
+
+ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
+ toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
+ toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
+ toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
+ toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
+ toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
+ toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
+ toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
+ toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
+ toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
+ toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
+ );
+}
+
+/* ------------ Saturating subtraction ------------ */
+
+ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
+ qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
+ qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
+ qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
+ qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
+ qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
+ qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
+ qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
+ qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
+ qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
+ qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
+ );
+}
+
+ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
+ qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
+ qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
+ qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
+ qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
+ qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
+ qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
+ qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
+ qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
+ qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
+ qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
+ );
+}
+
+/* ------------ Multiplication ------------ */
+
+ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ mul16( sel16x4_3(xx), sel16x4_3(yy) ),
+ mul16( sel16x4_2(xx), sel16x4_2(yy) ),
+ mul16( sel16x4_1(xx), sel16x4_1(yy) ),
+ mul16( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
+{
+ return mk32x2(
+ mul32( sel32x2_1(xx), sel32x2_1(yy) ),
+ mul32( sel32x2_0(xx), sel32x2_0(yy) )
+ );
+}
+
+ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
+ mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
+ mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
+ mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
+ mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
+ mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
+ mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+/* ------------ Comparison ------------ */
+
+ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
+{
+ return mk32x2(
+ cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
+ cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
+ );
+}
+
+ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
+ cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
+ cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
+ cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
+ cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
+ cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
+ cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
+ cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
+ cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
+ cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
+ cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
+ );
+}
+
+ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
+{
+ return mk32x2(
+ cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
+ cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
+ );
+}
+
+ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
+ cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
+ cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
+ cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
+ cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
+ cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
+ cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
+ cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
+ cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
+ cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
+ cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
+ );
+}
+
+ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
+{
+ return mk32x2(
+ cmpnez32( sel32x2_1(xx) ),
+ cmpnez32( sel32x2_0(xx) )
+ );
+}
+
+ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
+{
+ return mk16x4(
+ cmpnez16( sel16x4_3(xx) ),
+ cmpnez16( sel16x4_2(xx) ),
+ cmpnez16( sel16x4_1(xx) ),
+ cmpnez16( sel16x4_0(xx) )
+ );
+}
+
+ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
+{
+ return mk8x8(
+ cmpnez8( sel8x8_7(xx) ),
+ cmpnez8( sel8x8_6(xx) ),
+ cmpnez8( sel8x8_5(xx) ),
+ cmpnez8( sel8x8_4(xx) ),
+ cmpnez8( sel8x8_3(xx) ),
+ cmpnez8( sel8x8_2(xx) ),
+ cmpnez8( sel8x8_1(xx) ),
+ cmpnez8( sel8x8_0(xx) )
+ );
+}
+
+/* ------------ Saturating narrowing ------------ */
+
+ULong h_generic_calc_QNarrow32Sx2 ( ULong aa, ULong bb )
+{
+ UInt d = sel32x2_1(aa);
+ UInt c = sel32x2_0(aa);
+ UInt b = sel32x2_1(bb);
+ UInt a = sel32x2_0(bb);
+ return mk16x4(
+ qnarrow32Sto16(d),
+ qnarrow32Sto16(c),
+ qnarrow32Sto16(b),
+ qnarrow32Sto16(a)
+ );
+}
+
+ULong h_generic_calc_QNarrow16Sx4 ( ULong aa, ULong bb )
+{
+ UShort h = sel16x4_3(aa);
+ UShort g = sel16x4_2(aa);
+ UShort f = sel16x4_1(aa);
+ UShort e = sel16x4_0(aa);
+ UShort d = sel16x4_3(bb);
+ UShort c = sel16x4_2(bb);
+ UShort b = sel16x4_1(bb);
+ UShort a = sel16x4_0(bb);
+ return mk8x8(
+ qnarrow16Sto8(h),
+ qnarrow16Sto8(g),
+ qnarrow16Sto8(f),
+ qnarrow16Sto8(e),
+ qnarrow16Sto8(d),
+ qnarrow16Sto8(c),
+ qnarrow16Sto8(b),
+ qnarrow16Sto8(a)
+ );
+}
+
+ULong h_generic_calc_QNarrow16Ux4 ( ULong aa, ULong bb )
+{
+ UShort h = sel16x4_3(aa);
+ UShort g = sel16x4_2(aa);
+ UShort f = sel16x4_1(aa);
+ UShort e = sel16x4_0(aa);
+ UShort d = sel16x4_3(bb);
+ UShort c = sel16x4_2(bb);
+ UShort b = sel16x4_1(bb);
+ UShort a = sel16x4_0(bb);
+ return mk8x8(
+ qnarrow16Uto8(h),
+ qnarrow16Uto8(g),
+ qnarrow16Uto8(f),
+ qnarrow16Uto8(e),
+ qnarrow16Uto8(d),
+ qnarrow16Uto8(c),
+ qnarrow16Uto8(b),
+ qnarrow16Uto8(a)
+ );
+}
+
+/* ------------ Interleaving ------------ */
+
+ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
+{
+ return mk8x8(
+ sel8x8_7(aa),
+ sel8x8_7(bb),
+ sel8x8_6(aa),
+ sel8x8_6(bb),
+ sel8x8_5(aa),
+ sel8x8_5(bb),
+ sel8x8_4(aa),
+ sel8x8_4(bb)
+ );
+}
+
+ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
+{
+ return mk8x8(
+ sel8x8_3(aa),
+ sel8x8_3(bb),
+ sel8x8_2(aa),
+ sel8x8_2(bb),
+ sel8x8_1(aa),
+ sel8x8_1(bb),
+ sel8x8_0(aa),
+ sel8x8_0(bb)
+ );
+}
+
+ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
+{
+ return mk16x4(
+ sel16x4_3(aa),
+ sel16x4_3(bb),
+ sel16x4_2(aa),
+ sel16x4_2(bb)
+ );
+}
+
+ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
+{
+ return mk16x4(
+ sel16x4_1(aa),
+ sel16x4_1(bb),
+ sel16x4_0(aa),
+ sel16x4_0(bb)
+ );
+}
+
+ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
+{
+ return mk32x2(
+ sel32x2_1(aa),
+ sel32x2_1(bb)
+ );
+}
+
+ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
+{
+ return mk32x2(
+ sel32x2_0(aa),
+ sel32x2_0(bb)
+ );
+}
+
+/* ------------ Concatenation ------------ */
+
+ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
+{
+ return mk16x4(
+ sel16x4_3(aa),
+ sel16x4_1(aa),
+ sel16x4_3(bb),
+ sel16x4_1(bb)
+ );
+}
+
+ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
+{
+ return mk16x4(
+ sel16x4_2(aa),
+ sel16x4_0(aa),
+ sel16x4_2(bb),
+ sel16x4_0(bb)
+ );
+}
+
+/* misc hack looking for a proper home */
+ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
+{
+ return mk8x8(
+ index8x8(aa, sel8x8_7(bb)),
+ index8x8(aa, sel8x8_6(bb)),
+ index8x8(aa, sel8x8_5(bb)),
+ index8x8(aa, sel8x8_4(bb)),
+ index8x8(aa, sel8x8_3(bb)),
+ index8x8(aa, sel8x8_2(bb)),
+ index8x8(aa, sel8x8_1(bb)),
+ index8x8(aa, sel8x8_0(bb))
+ );
+}
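+
+/* Perm8x8 example: each byte of bb selects, by its low 3 bits, a
+ byte of aa. Hence Perm8x8(aa, 0x0001020304050607ULL) reverses the
+ bytes of aa, and Perm8x8(aa, 0) broadcasts byte 0 of aa to all
+ eight lanes. */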
+
+/* ------------ Shifting ------------ */
+/* Note that because these primops are undefined if the shift amount
+ equals or exceeds the lane width, the shift amount is masked so
+ that the scalar shifts are always in range. In fact, given the
+ semantics of these primops (ShlN16x4, etc) it is an error if in
+ fact we are ever given an out-of-range shift amount.
+*/
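+/* For instance ShlN16x4(0x0001000100010001ULL, 4) gives
+ 0x0010001000100010ULL: each 16-bit lane is shifted independently,
+ the amount having been masked to 0..15 beforehand. */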
+ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
+{
+ /* vassert(nn < 32); */
+ nn &= 31;
+ return mk32x2(
+ shl32( sel32x2_1(xx), nn ),
+ shl32( sel32x2_0(xx), nn )
+ );
+}
+
+ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
+{
+ /* vassert(nn < 16); */
+ nn &= 15;
+ return mk16x4(
+ shl16( sel16x4_3(xx), nn ),
+ shl16( sel16x4_2(xx), nn ),
+ shl16( sel16x4_1(xx), nn ),
+ shl16( sel16x4_0(xx), nn )
+ );
+}
+
+ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
+{
+ /* vassert(nn < 8); */
+ nn &= 7;
+ return mk8x8(
+ shl8( sel8x8_7(xx), nn ),
+ shl8( sel8x8_6(xx), nn ),
+ shl8( sel8x8_5(xx), nn ),
+ shl8( sel8x8_4(xx), nn ),
+ shl8( sel8x8_3(xx), nn ),
+ shl8( sel8x8_2(xx), nn ),
+ shl8( sel8x8_1(xx), nn ),
+ shl8( sel8x8_0(xx), nn )
+ );
+}
+
+ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
+{
+ /* vassert(nn < 32); */
+ nn &= 31;
+ return mk32x2(
+ shr32( sel32x2_1(xx), nn ),
+ shr32( sel32x2_0(xx), nn )
+ );
+}
+
+ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
+{
+ /* vassert(nn < 16); */
+ nn &= 15;
+ return mk16x4(
+ shr16( sel16x4_3(xx), nn ),
+ shr16( sel16x4_2(xx), nn ),
+ shr16( sel16x4_1(xx), nn ),
+ shr16( sel16x4_0(xx), nn )
+ );
+}
+
+ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
+{
+ /* vassert(nn < 32); */
+ nn &= 31;
+ return mk32x2(
+ sar32( sel32x2_1(xx), nn ),
+ sar32( sel32x2_0(xx), nn )
+ );
+}
+
+ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
+{
+ /* vassert(nn < 16); */
+ nn &= 15;
+ return mk16x4(
+ sar16( sel16x4_3(xx), nn ),
+ sar16( sel16x4_2(xx), nn ),
+ sar16( sel16x4_1(xx), nn ),
+ sar16( sel16x4_0(xx), nn )
+ );
+}
+
+ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
+{
+ /* vassert(nn < 8); */
+ nn &= 7;
+ return mk8x8(
+ sar8( sel8x8_7(xx), nn ),
+ sar8( sel8x8_6(xx), nn ),
+ sar8( sel8x8_5(xx), nn ),
+ sar8( sel8x8_4(xx), nn ),
+ sar8( sel8x8_3(xx), nn ),
+ sar8( sel8x8_2(xx), nn ),
+ sar8( sel8x8_1(xx), nn ),
+ sar8( sel8x8_0(xx), nn )
+ );
+}
+
+/* ------------ Averaging ------------ */
+
+ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
+ avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
+ avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
+ avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
+ avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
+ avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
+ avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
+ avg8U( sel8x8_0(xx), sel8x8_0(yy) )
+ );
+}
+
+ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
+ avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
+ avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
+ avg16U( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
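+
+/* Assuming avg8U/avg16U (defined earlier in this file) compute the
+   usual rounding average (x + y + 1) >> 1 without intermediate
+   overflow, these match e.g. x86's PAVGB/PAVGW:
+   avg8U(0xFF, 0x00) == 0x80. */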
+
+/* ------------ max/min ------------ */
+
+ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ max16S( sel16x4_3(xx), sel16x4_3(yy) ),
+ max16S( sel16x4_2(xx), sel16x4_2(yy) ),
+ max16S( sel16x4_1(xx), sel16x4_1(yy) ),
+ max16S( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ max8U( sel8x8_7(xx), sel8x8_7(yy) ),
+ max8U( sel8x8_6(xx), sel8x8_6(yy) ),
+ max8U( sel8x8_5(xx), sel8x8_5(yy) ),
+ max8U( sel8x8_4(xx), sel8x8_4(yy) ),
+ max8U( sel8x8_3(xx), sel8x8_3(yy) ),
+ max8U( sel8x8_2(xx), sel8x8_2(yy) ),
+ max8U( sel8x8_1(xx), sel8x8_1(yy) ),
+ max8U( sel8x8_0(xx), sel8x8_0(yy) )
+ );
+}
+
+ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
+{
+ return mk16x4(
+ min16S( sel16x4_3(xx), sel16x4_3(yy) ),
+ min16S( sel16x4_2(xx), sel16x4_2(yy) ),
+ min16S( sel16x4_1(xx), sel16x4_1(yy) ),
+ min16S( sel16x4_0(xx), sel16x4_0(yy) )
+ );
+}
+
+ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
+{
+ return mk8x8(
+ min8U( sel8x8_7(xx), sel8x8_7(yy) ),
+ min8U( sel8x8_6(xx), sel8x8_6(yy) ),
+ min8U( sel8x8_5(xx), sel8x8_5(yy) ),
+ min8U( sel8x8_4(xx), sel8x8_4(yy) ),
+ min8U( sel8x8_3(xx), sel8x8_3(yy) ),
+ min8U( sel8x8_2(xx), sel8x8_2(yy) ),
+ min8U( sel8x8_1(xx), sel8x8_1(yy) ),
+ min8U( sel8x8_0(xx), sel8x8_0(yy) )
+ );
+}
+
+/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
+
+/* Tuple/select functions for 16x2 vectors. */
+static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
+ return (((UInt)w1) << 16) | ((UInt)w2);
+}
+
+static inline UShort sel16x2_1 ( UInt w32 ) {
+ return 0xFFFF & (UShort)(w32 >> 16);
+}
+static inline UShort sel16x2_0 ( UInt w32 ) {
+ return 0xFFFF & (UShort)(w32);
+}
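+
+/* For instance, mk16x2(0xAAAA, 0xBBBB) == 0xAAAABBBB, and sel16x2_1
+   and sel16x2_0 recover 0xAAAA and 0xBBBB respectively: lane 1 is
+   the more significant half. */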
+
+static inline UInt mk8x4 ( UChar w3, UChar w2,
+ UChar w1, UChar w0 ) {
+ UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
+ | (((UInt)w1) << 8) | (((UInt)w0) << 0);
+ return w32;
+}
+
+static inline UChar sel8x4_3 ( UInt w32 ) {
+ return toUChar(0xFF & (w32 >> 24));
+}
+static inline UChar sel8x4_2 ( UInt w32 ) {
+ return toUChar(0xFF & (w32 >> 16));
+}
+static inline UChar sel8x4_1 ( UInt w32 ) {
+ return toUChar(0xFF & (w32 >> 8));
+}
+static inline UChar sel8x4_0 ( UInt w32 ) {
+ return toUChar(0xFF & (w32 >> 0));
+}
+
+
+/* ----------------------------------------------------- */
+/* More externally visible functions. These simply
+ implement the corresponding IR primops. */
+/* ----------------------------------------------------- */
+
+/* ------ 16x2 ------ */
+
+UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
+{
+ return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
+ sel16x2_0(xx) + sel16x2_0(yy) );
+}
+
+UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
+{
+ return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
+ sel16x2_0(xx) - sel16x2_0(yy) );
+}
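+
+/* The H* (halving) variants below narrow a 17-bit intermediate sum
+   by one place: assuming hadd16U(x,y) is ((UInt)x + (UInt)y) >> 1,
+   hadd16U(0xFFFF, 0xFFFF) == 0xFFFF with no overflow.  The Q*
+   (saturating) variants clamp instead: qadd16U saturates at 0xFFFF,
+   qadd16S at the range -0x8000 .. 0x7FFF. */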
+
+UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
+{
+ return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
+ hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
+}
+
+UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
+{
+ return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
+ hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
+}
+
+UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
+{
+ return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
+ hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
+}
+
+UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
+{
+ return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
+ hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
+}
+
+UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
+{
+ return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
+ qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
+}
+
+UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
+{
+ return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
+ qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
+}
+
+UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
+{
+ return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
+ qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
+}
+
+UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
+{
+ return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
+ qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
+}
+
+/* ------ 8x4 ------ */
+
+UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ sel8x4_3(xx) + sel8x4_3(yy),
+ sel8x4_2(xx) + sel8x4_2(yy),
+ sel8x4_1(xx) + sel8x4_1(yy),
+ sel8x4_0(xx) + sel8x4_0(yy)
+ );
+}
+
+UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ sel8x4_3(xx) - sel8x4_3(yy),
+ sel8x4_2(xx) - sel8x4_2(yy),
+ sel8x4_1(xx) - sel8x4_1(yy),
+ sel8x4_0(xx) - sel8x4_0(yy)
+ );
+}
+
+UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
+ hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
+ hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
+ hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
+ );
+}
+
+UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
+ hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
+ hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
+ hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
+ );
+}
+
+UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
+ hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
+ hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
+ hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
+ );
+}
+
+UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
+ hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
+ hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
+ hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
+ );
+}
+
+UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
+ qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
+ qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
+ qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
+ );
+}
+
+UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
+ qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
+ qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
+ qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
+ );
+}
+
+UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
+ qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
+ qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
+ qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
+ );
+}
+
+UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
+{
+ return mk8x4(
+ qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
+ qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
+ qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
+ qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
+ );
+}
+
+UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
+{
+ return mk16x2(
+ cmpnez16( sel16x2_1(xx) ),
+ cmpnez16( sel16x2_0(xx) )
+ );
+}
+
+UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
+{
+ return mk8x4(
+ cmpnez8( sel8x4_3(xx) ),
+ cmpnez8( sel8x4_2(xx) ),
+ cmpnez8( sel8x4_1(xx) ),
+ cmpnez8( sel8x4_0(xx) )
+ );
+}
+
+UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
+{
+ return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
+ + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
+ + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
+ + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
+}
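+
+/* Worked example, assuming absdiff8U(x, y) == |x - y| on unsigned
+   bytes: with xx lanes 3..0 holding (1,2,3,4) and yy holding
+   (4,3,2,1), the sum is |1-4| + |2-3| + |3-2| + |4-1| = 8. */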
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd64.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_generic_simd64.h b/VEX/priv/host_generic_simd64.h
new file mode 100644
index 0000000..e854fc7
--- /dev/null
+++ b/VEX/priv/host_generic_simd64.h
@@ -0,0 +1,161 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_generic_simd64.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
+ where the instruction selectors cannot generate code in-line.
+ These are purely back-end entities and cannot be seen/referenced
+ as clean helper functions from IR.
+
+ These will get called from generated code and therefore should be
+   well behaved -- no floating-point or MMX insns, just straight
+ integer code.
+
+ Each function implements the correspondingly-named IR primop.
+*/
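+
+/* Illustrative usage: the instruction selectors arrange calls such as
+
+      ULong r = h_generic_calc_Add16x4(0x0001000200030004ULL,
+                                       0x000A000B000C000DULL);
+
+   giving r == 0x000B000D000F0011ULL -- each 16-bit lane is added
+   independently, with wraparound and no cross-lane carries. */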
+
+#ifndef __VEX_HOST_GENERIC_SIMD64_H
+#define __VEX_HOST_GENERIC_SIMD64_H
+
+#include "libvex_basictypes.h"
+
+/* DO NOT MAKE THESE INTO REGPARM FNS! THIS WILL BREAK CALLING
+   SEQUENCES GENERATED BY host_x86_isel.c. */
+
+extern ULong h_generic_calc_Add32x2 ( ULong, ULong );
+extern ULong h_generic_calc_Add16x4 ( ULong, ULong );
+extern ULong h_generic_calc_Add8x8 ( ULong, ULong );
+
+extern ULong h_generic_calc_QAdd16Sx4 ( ULong, ULong );
+extern ULong h_generic_calc_QAdd8Sx8 ( ULong, ULong );
+extern ULong h_generic_calc_QAdd16Ux4 ( ULong, ULong );
+extern ULong h_generic_calc_QAdd8Ux8 ( ULong, ULong );
+
+extern ULong h_generic_calc_Sub32x2 ( ULong, ULong );
+extern ULong h_generic_calc_Sub16x4 ( ULong, ULong );
+extern ULong h_generic_calc_Sub8x8 ( ULong, ULong );
+
+extern ULong h_generic_calc_QSub16Sx4 ( ULong, ULong );
+extern ULong h_generic_calc_QSub8Sx8 ( ULong, ULong );
+extern ULong h_generic_calc_QSub16Ux4 ( ULong, ULong );
+extern ULong h_generic_calc_QSub8Ux8 ( ULong, ULong );
+
+extern ULong h_generic_calc_Mul16x4 ( ULong, ULong );
+extern ULong h_generic_calc_Mul32x2 ( ULong, ULong );
+extern ULong h_generic_calc_MulHi16Sx4 ( ULong, ULong );
+extern ULong h_generic_calc_MulHi16Ux4 ( ULong, ULong );
+
+extern ULong h_generic_calc_CmpEQ32x2 ( ULong, ULong );
+extern ULong h_generic_calc_CmpEQ16x4 ( ULong, ULong );
+extern ULong h_generic_calc_CmpEQ8x8 ( ULong, ULong );
+extern ULong h_generic_calc_CmpGT32Sx2 ( ULong, ULong );
+extern ULong h_generic_calc_CmpGT16Sx4 ( ULong, ULong );
+extern ULong h_generic_calc_CmpGT8Sx8 ( ULong, ULong );
+
+extern ULong h_generic_calc_CmpNEZ32x2 ( ULong );
+extern ULong h_generic_calc_CmpNEZ16x4 ( ULong );
+extern ULong h_generic_calc_CmpNEZ8x8 ( ULong );
+
+extern ULong h_generic_calc_QNarrow32Sx2 ( ULong, ULong );
+extern ULong h_generic_calc_QNarrow16Sx4 ( ULong, ULong );
+extern ULong h_generic_calc_QNarrow16Ux4 ( ULong, ULong );
+
+extern ULong h_generic_calc_InterleaveHI8x8 ( ULong, ULong );
+extern ULong h_generic_calc_InterleaveLO8x8 ( ULong, ULong );
+extern ULong h_generic_calc_InterleaveHI16x4 ( ULong, ULong );
+extern ULong h_generic_calc_InterleaveLO16x4 ( ULong, ULong );
+extern ULong h_generic_calc_InterleaveHI32x2 ( ULong, ULong );
+extern ULong h_generic_calc_InterleaveLO32x2 ( ULong, ULong );
+
+extern ULong h_generic_calc_CatOddLanes16x4 ( ULong, ULong );
+extern ULong h_generic_calc_CatEvenLanes16x4 ( ULong, ULong );
+extern ULong h_generic_calc_Perm8x8 ( ULong, ULong );
+
+extern ULong h_generic_calc_ShlN8x8 ( ULong, UInt );
+extern ULong h_generic_calc_ShlN16x4 ( ULong, UInt );
+extern ULong h_generic_calc_ShlN32x2 ( ULong, UInt );
+
+extern ULong h_generic_calc_ShrN16x4 ( ULong, UInt );
+extern ULong h_generic_calc_ShrN32x2 ( ULong, UInt );
+
+extern ULong h_generic_calc_SarN8x8 ( ULong, UInt );
+extern ULong h_generic_calc_SarN16x4 ( ULong, UInt );
+extern ULong h_generic_calc_SarN32x2 ( ULong, UInt );
+
+extern ULong h_generic_calc_Avg8Ux8 ( ULong, ULong );
+extern ULong h_generic_calc_Avg16Ux4 ( ULong, ULong );
+
+extern ULong h_generic_calc_Max16Sx4 ( ULong, ULong );
+extern ULong h_generic_calc_Max8Ux8 ( ULong, ULong );
+extern ULong h_generic_calc_Min16Sx4 ( ULong, ULong );
+extern ULong h_generic_calc_Min8Ux8 ( ULong, ULong );
+
+/* 32-bit SIMD HELPERS */
+
+extern UInt h_generic_calc_Add16x2 ( UInt, UInt );
+extern UInt h_generic_calc_Sub16x2 ( UInt, UInt );
+
+extern UInt h_generic_calc_HAdd16Ux2 ( UInt, UInt );
+extern UInt h_generic_calc_HAdd16Sx2 ( UInt, UInt );
+extern UInt h_generic_calc_HSub16Ux2 ( UInt, UInt );
+extern UInt h_generic_calc_HSub16Sx2 ( UInt, UInt );
+
+extern UInt h_generic_calc_QAdd16Ux2 ( UInt, UInt );
+extern UInt h_generic_calc_QAdd16Sx2 ( UInt, UInt );
+extern UInt h_generic_calc_QSub16Ux2 ( UInt, UInt );
+extern UInt h_generic_calc_QSub16Sx2 ( UInt, UInt );
+
+extern UInt h_generic_calc_Add8x4 ( UInt, UInt );
+extern UInt h_generic_calc_Sub8x4 ( UInt, UInt );
+
+extern UInt h_generic_calc_HAdd8Ux4 ( UInt, UInt );
+extern UInt h_generic_calc_HAdd8Sx4 ( UInt, UInt );
+extern UInt h_generic_calc_HSub8Ux4 ( UInt, UInt );
+extern UInt h_generic_calc_HSub8Sx4 ( UInt, UInt );
+
+extern UInt h_generic_calc_QAdd8Ux4 ( UInt, UInt );
+extern UInt h_generic_calc_QAdd8Sx4 ( UInt, UInt );
+extern UInt h_generic_calc_QSub8Ux4 ( UInt, UInt );
+extern UInt h_generic_calc_QSub8Sx4 ( UInt, UInt );
+
+extern UInt h_generic_calc_Sad8Ux4 ( UInt, UInt );
+
+extern UInt h_generic_calc_CmpNEZ16x2 ( UInt );
+extern UInt h_generic_calc_CmpNEZ8x4 ( UInt );
+
+#endif /* ndef __VEX_HOST_GENERIC_SIMD64_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_generic_simd64.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_ppc_defs.c b/VEX/priv/host_ppc_defs.c
new file mode 100644
index 0000000..54fd2fd
--- /dev/null
+++ b/VEX/priv/host_ppc_defs.c
@@ -0,0 +1,3851 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_ppc_defs.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex.h"
+#include "libvex_trc_values.h"
+
+#include "main_util.h"
+#include "host_generic_regs.h"
+#include "host_ppc_defs.h"
+
+
+/* --------- Registers. --------- */
+
+void ppHRegPPC ( HReg reg )
+{
+ Int r;
+ static HChar* ireg32_names[32]
+ = { "%r0", "%r1", "%r2", "%r3",
+ "%r4", "%r5", "%r6", "%r7",
+ "%r8", "%r9", "%r10", "%r11",
+ "%r12", "%r13", "%r14", "%r15",
+ "%r16", "%r17", "%r18", "%r19",
+ "%r20", "%r21", "%r22", "%r23",
+ "%r24", "%r25", "%r26", "%r27",
+ "%r28", "%r29", "%r30", "%r31" };
+ /* Be generic for all virtual regs. */
+ if (hregIsVirtual(reg)) {
+ ppHReg(reg);
+ return;
+ }
+ /* But specific for real regs. */
+ switch (hregClass(reg)) {
+ case HRcInt64:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 32);
+ vex_printf("%s", ireg32_names[r]);
+ return;
+ case HRcInt32:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 32);
+ vex_printf("%s", ireg32_names[r]);
+ return;
+ case HRcFlt64:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 32);
+ vex_printf("%%fr%d", r);
+ return;
+ case HRcVec128:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 32);
+ vex_printf("%%v%d", r);
+ return;
+ default:
+ vpanic("ppHRegPPC");
+ }
+}
+
+
+#define MkHRegGPR(_n, _mode64) \
+ mkHReg(_n, _mode64 ? HRcInt64 : HRcInt32, False)
+
+HReg hregPPC_GPR0 ( Bool mode64 ) { return MkHRegGPR( 0, mode64); }
+HReg hregPPC_GPR1 ( Bool mode64 ) { return MkHRegGPR( 1, mode64); }
+HReg hregPPC_GPR2 ( Bool mode64 ) { return MkHRegGPR( 2, mode64); }
+HReg hregPPC_GPR3 ( Bool mode64 ) { return MkHRegGPR( 3, mode64); }
+HReg hregPPC_GPR4 ( Bool mode64 ) { return MkHRegGPR( 4, mode64); }
+HReg hregPPC_GPR5 ( Bool mode64 ) { return MkHRegGPR( 5, mode64); }
+HReg hregPPC_GPR6 ( Bool mode64 ) { return MkHRegGPR( 6, mode64); }
+HReg hregPPC_GPR7 ( Bool mode64 ) { return MkHRegGPR( 7, mode64); }
+HReg hregPPC_GPR8 ( Bool mode64 ) { return MkHRegGPR( 8, mode64); }
+HReg hregPPC_GPR9 ( Bool mode64 ) { return MkHRegGPR( 9, mode64); }
+HReg hregPPC_GPR10 ( Bool mode64 ) { return MkHRegGPR(10, mode64); }
+HReg hregPPC_GPR11 ( Bool mode64 ) { return MkHRegGPR(11, mode64); }
+HReg hregPPC_GPR12 ( Bool mode64 ) { return MkHRegGPR(12, mode64); }
+HReg hregPPC_GPR13 ( Bool mode64 ) { return MkHRegGPR(13, mode64); }
+HReg hregPPC_GPR14 ( Bool mode64 ) { return MkHRegGPR(14, mode64); }
+HReg hregPPC_GPR15 ( Bool mode64 ) { return MkHRegGPR(15, mode64); }
+HReg hregPPC_GPR16 ( Bool mode64 ) { return MkHRegGPR(16, mode64); }
+HReg hregPPC_GPR17 ( Bool mode64 ) { return MkHRegGPR(17, mode64); }
+HReg hregPPC_GPR18 ( Bool mode64 ) { return MkHRegGPR(18, mode64); }
+HReg hregPPC_GPR19 ( Bool mode64 ) { return MkHRegGPR(19, mode64); }
+HReg hregPPC_GPR20 ( Bool mode64 ) { return MkHRegGPR(20, mode64); }
+HReg hregPPC_GPR21 ( Bool mode64 ) { return MkHRegGPR(21, mode64); }
+HReg hregPPC_GPR22 ( Bool mode64 ) { return MkHRegGPR(22, mode64); }
+HReg hregPPC_GPR23 ( Bool mode64 ) { return MkHRegGPR(23, mode64); }
+HReg hregPPC_GPR24 ( Bool mode64 ) { return MkHRegGPR(24, mode64); }
+HReg hregPPC_GPR25 ( Bool mode64 ) { return MkHRegGPR(25, mode64); }
+HReg hregPPC_GPR26 ( Bool mode64 ) { return MkHRegGPR(26, mode64); }
+HReg hregPPC_GPR27 ( Bool mode64 ) { return MkHRegGPR(27, mode64); }
+HReg hregPPC_GPR28 ( Bool mode64 ) { return MkHRegGPR(28, mode64); }
+HReg hregPPC_GPR29 ( Bool mode64 ) { return MkHRegGPR(29, mode64); }
+HReg hregPPC_GPR30 ( Bool mode64 ) { return MkHRegGPR(30, mode64); }
+HReg hregPPC_GPR31 ( Bool mode64 ) { return MkHRegGPR(31, mode64); }
+
+#undef MkHRegGPR
+
+HReg hregPPC_FPR0 ( void ) { return mkHReg( 0, HRcFlt64, False); }
+HReg hregPPC_FPR1 ( void ) { return mkHReg( 1, HRcFlt64, False); }
+HReg hregPPC_FPR2 ( void ) { return mkHReg( 2, HRcFlt64, False); }
+HReg hregPPC_FPR3 ( void ) { return mkHReg( 3, HRcFlt64, False); }
+HReg hregPPC_FPR4 ( void ) { return mkHReg( 4, HRcFlt64, False); }
+HReg hregPPC_FPR5 ( void ) { return mkHReg( 5, HRcFlt64, False); }
+HReg hregPPC_FPR6 ( void ) { return mkHReg( 6, HRcFlt64, False); }
+HReg hregPPC_FPR7 ( void ) { return mkHReg( 7, HRcFlt64, False); }
+HReg hregPPC_FPR8 ( void ) { return mkHReg( 8, HRcFlt64, False); }
+HReg hregPPC_FPR9 ( void ) { return mkHReg( 9, HRcFlt64, False); }
+HReg hregPPC_FPR10 ( void ) { return mkHReg(10, HRcFlt64, False); }
+HReg hregPPC_FPR11 ( void ) { return mkHReg(11, HRcFlt64, False); }
+HReg hregPPC_FPR12 ( void ) { return mkHReg(12, HRcFlt64, False); }
+HReg hregPPC_FPR13 ( void ) { return mkHReg(13, HRcFlt64, False); }
+HReg hregPPC_FPR14 ( void ) { return mkHReg(14, HRcFlt64, False); }
+HReg hregPPC_FPR15 ( void ) { return mkHReg(15, HRcFlt64, False); }
+HReg hregPPC_FPR16 ( void ) { return mkHReg(16, HRcFlt64, False); }
+HReg hregPPC_FPR17 ( void ) { return mkHReg(17, HRcFlt64, False); }
+HReg hregPPC_FPR18 ( void ) { return mkHReg(18, HRcFlt64, False); }
+HReg hregPPC_FPR19 ( void ) { return mkHReg(19, HRcFlt64, False); }
+HReg hregPPC_FPR20 ( void ) { return mkHReg(20, HRcFlt64, False); }
+HReg hregPPC_FPR21 ( void ) { return mkHReg(21, HRcFlt64, False); }
+HReg hregPPC_FPR22 ( void ) { return mkHReg(22, HRcFlt64, False); }
+HReg hregPPC_FPR23 ( void ) { return mkHReg(23, HRcFlt64, False); }
+HReg hregPPC_FPR24 ( void ) { return mkHReg(24, HRcFlt64, False); }
+HReg hregPPC_FPR25 ( void ) { return mkHReg(25, HRcFlt64, False); }
+HReg hregPPC_FPR26 ( void ) { return mkHReg(26, HRcFlt64, False); }
+HReg hregPPC_FPR27 ( void ) { return mkHReg(27, HRcFlt64, False); }
+HReg hregPPC_FPR28 ( void ) { return mkHReg(28, HRcFlt64, False); }
+HReg hregPPC_FPR29 ( void ) { return mkHReg(29, HRcFlt64, False); }
+HReg hregPPC_FPR30 ( void ) { return mkHReg(30, HRcFlt64, False); }
+HReg hregPPC_FPR31 ( void ) { return mkHReg(31, HRcFlt64, False); }
+
+HReg hregPPC_VR0 ( void ) { return mkHReg( 0, HRcVec128, False); }
+HReg hregPPC_VR1 ( void ) { return mkHReg( 1, HRcVec128, False); }
+HReg hregPPC_VR2 ( void ) { return mkHReg( 2, HRcVec128, False); }
+HReg hregPPC_VR3 ( void ) { return mkHReg( 3, HRcVec128, False); }
+HReg hregPPC_VR4 ( void ) { return mkHReg( 4, HRcVec128, False); }
+HReg hregPPC_VR5 ( void ) { return mkHReg( 5, HRcVec128, False); }
+HReg hregPPC_VR6 ( void ) { return mkHReg( 6, HRcVec128, False); }
+HReg hregPPC_VR7 ( void ) { return mkHReg( 7, HRcVec128, False); }
+HReg hregPPC_VR8 ( void ) { return mkHReg( 8, HRcVec128, False); }
+HReg hregPPC_VR9 ( void ) { return mkHReg( 9, HRcVec128, False); }
+HReg hregPPC_VR10 ( void ) { return mkHReg(10, HRcVec128, False); }
+HReg hregPPC_VR11 ( void ) { return mkHReg(11, HRcVec128, False); }
+HReg hregPPC_VR12 ( void ) { return mkHReg(12, HRcVec128, False); }
+HReg hregPPC_VR13 ( void ) { return mkHReg(13, HRcVec128, False); }
+HReg hregPPC_VR14 ( void ) { return mkHReg(14, HRcVec128, False); }
+HReg hregPPC_VR15 ( void ) { return mkHReg(15, HRcVec128, False); }
+HReg hregPPC_VR16 ( void ) { return mkHReg(16, HRcVec128, False); }
+HReg hregPPC_VR17 ( void ) { return mkHReg(17, HRcVec128, False); }
+HReg hregPPC_VR18 ( void ) { return mkHReg(18, HRcVec128, False); }
+HReg hregPPC_VR19 ( void ) { return mkHReg(19, HRcVec128, False); }
+HReg hregPPC_VR20 ( void ) { return mkHReg(20, HRcVec128, False); }
+HReg hregPPC_VR21 ( void ) { return mkHReg(21, HRcVec128, False); }
+HReg hregPPC_VR22 ( void ) { return mkHReg(22, HRcVec128, False); }
+HReg hregPPC_VR23 ( void ) { return mkHReg(23, HRcVec128, False); }
+HReg hregPPC_VR24 ( void ) { return mkHReg(24, HRcVec128, False); }
+HReg hregPPC_VR25 ( void ) { return mkHReg(25, HRcVec128, False); }
+HReg hregPPC_VR26 ( void ) { return mkHReg(26, HRcVec128, False); }
+HReg hregPPC_VR27 ( void ) { return mkHReg(27, HRcVec128, False); }
+HReg hregPPC_VR28 ( void ) { return mkHReg(28, HRcVec128, False); }
+HReg hregPPC_VR29 ( void ) { return mkHReg(29, HRcVec128, False); }
+HReg hregPPC_VR30 ( void ) { return mkHReg(30, HRcVec128, False); }
+HReg hregPPC_VR31 ( void ) { return mkHReg(31, HRcVec128, False); }
+
+void getAllocableRegs_PPC ( Int* nregs, HReg** arr, Bool mode64 )
+{
+ UInt i=0;
+ if (mode64)
+ *nregs = (32-9) + (32-24) + (32-24);
+ else
+ *nregs = (32-7) + (32-24) + (32-24);
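+   /* That is: 23 GPRs (r3-r10 and r14-r28) in 64-bit mode, or 25
+      GPRs (additionally r11 and r12) in 32-bit mode, plus 8 FPRs
+      (f14-f21) and 8 VRs (v20-v27) in either mode. */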
+ *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
+   // GPR0 = scratch reg where possible - some ops interpret as value zero
+ // GPR1 = stack pointer
+ // GPR2 = TOC pointer
+ (*arr)[i++] = hregPPC_GPR3(mode64);
+ (*arr)[i++] = hregPPC_GPR4(mode64);
+ (*arr)[i++] = hregPPC_GPR5(mode64);
+ (*arr)[i++] = hregPPC_GPR6(mode64);
+ (*arr)[i++] = hregPPC_GPR7(mode64);
+ (*arr)[i++] = hregPPC_GPR8(mode64);
+ (*arr)[i++] = hregPPC_GPR9(mode64);
+ (*arr)[i++] = hregPPC_GPR10(mode64);
+ if (!mode64) {
+ /* in mode64:
+ r11 used for calls by ptr / env ptr for some langs
+ r12 used for exception handling and global linkage code */
+ (*arr)[i++] = hregPPC_GPR11(mode64);
+ (*arr)[i++] = hregPPC_GPR12(mode64);
+ }
+ // GPR13 = thread specific pointer
+ // GPR14 and above are callee save. Yay.
+ (*arr)[i++] = hregPPC_GPR14(mode64);
+ (*arr)[i++] = hregPPC_GPR15(mode64);
+ (*arr)[i++] = hregPPC_GPR16(mode64);
+ (*arr)[i++] = hregPPC_GPR17(mode64);
+ (*arr)[i++] = hregPPC_GPR18(mode64);
+ (*arr)[i++] = hregPPC_GPR19(mode64);
+ (*arr)[i++] = hregPPC_GPR20(mode64);
+ (*arr)[i++] = hregPPC_GPR21(mode64);
+ (*arr)[i++] = hregPPC_GPR22(mode64);
+ (*arr)[i++] = hregPPC_GPR23(mode64);
+ (*arr)[i++] = hregPPC_GPR24(mode64);
+ (*arr)[i++] = hregPPC_GPR25(mode64);
+ (*arr)[i++] = hregPPC_GPR26(mode64);
+ (*arr)[i++] = hregPPC_GPR27(mode64);
+ (*arr)[i++] = hregPPC_GPR28(mode64);
+ // GPR29 is reserved for the dispatcher
+ // GPR30 is reserved as AltiVec spill reg temporary
+ // GPR31 is reserved for the GuestStatePtr
+
+   /* Don't waste the register allocator's time trawling through
+      zillions of FP registers - they mostly will never be used.
+      We'll tolerate the occasional extra spill instead. */
+ /* For both ppc32-linux and ppc64-linux, f14-f31 are callee save.
+ So use them. */
+ (*arr)[i++] = hregPPC_FPR14();
+ (*arr)[i++] = hregPPC_FPR15();
+ (*arr)[i++] = hregPPC_FPR16();
+ (*arr)[i++] = hregPPC_FPR17();
+ (*arr)[i++] = hregPPC_FPR18();
+ (*arr)[i++] = hregPPC_FPR19();
+ (*arr)[i++] = hregPPC_FPR20();
+ (*arr)[i++] = hregPPC_FPR21();
+
+   /* Same deal re AltiVec */
+ /* For both ppc32-linux and ppc64-linux, v20-v31 are callee save.
+ So use them. */
+ /* NB, vr29 is used as a scratch temporary -- do not allocate */
+ (*arr)[i++] = hregPPC_VR20();
+ (*arr)[i++] = hregPPC_VR21();
+ (*arr)[i++] = hregPPC_VR22();
+ (*arr)[i++] = hregPPC_VR23();
+ (*arr)[i++] = hregPPC_VR24();
+ (*arr)[i++] = hregPPC_VR25();
+ (*arr)[i++] = hregPPC_VR26();
+ (*arr)[i++] = hregPPC_VR27();
+
+ vassert(i == *nregs);
+}
+
+
+/* --------- Condition codes, PPC encoding. --------- */
+
+HChar* showPPCCondCode ( PPCCondCode cond )
+{
+ if (cond.test == Pct_ALWAYS) return "always";
+
+ switch (cond.flag) {
+ case Pcf_7SO:
+ return (cond.test == Pct_TRUE) ? "cr7.so=1" : "cr7.so=0";
+ case Pcf_7EQ:
+ return (cond.test == Pct_TRUE) ? "cr7.eq=1" : "cr7.eq=0";
+ case Pcf_7GT:
+ return (cond.test == Pct_TRUE) ? "cr7.gt=1" : "cr7.gt=0";
+ case Pcf_7LT:
+ return (cond.test == Pct_TRUE) ? "cr7.lt=1" : "cr7.lt=0";
+      default: vpanic("showPPCCondCode");
+ }
+}
+
+/* construct condition code */
+PPCCondCode mk_PPCCondCode ( PPCCondTest test, PPCCondFlag flag )
+{
+ PPCCondCode cc;
+ cc.flag = flag;
+ cc.test = test;
+ return cc;
+}
+
+/* false->true, true->false */
+PPCCondTest invertCondTest ( PPCCondTest ct )
+{
+ vassert(ct != Pct_ALWAYS);
+ return (ct == Pct_TRUE) ? Pct_FALSE : Pct_TRUE;
+}
+
+
+/* --------- PPCAMode: memory address expressions. --------- */
+
+PPCAMode* PPCAMode_IR ( Int idx, HReg base ) {
+ PPCAMode* am = LibVEX_Alloc(sizeof(PPCAMode));
+ vassert(idx >= -0x8000 && idx < 0x8000);
+ am->tag = Pam_IR;
+ am->Pam.IR.base = base;
+ am->Pam.IR.index = idx;
+ return am;
+}
+PPCAMode* PPCAMode_RR ( HReg idx, HReg base ) {
+ PPCAMode* am = LibVEX_Alloc(sizeof(PPCAMode));
+ am->tag = Pam_RR;
+ am->Pam.RR.base = base;
+ am->Pam.RR.index = idx;
+ return am;
+}
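+
+/* Pam_IR denotes displacement addressing, d(Rb), as in
+   "lwz r3,8(r1)"; Pam_RR denotes register-indexed addressing,
+   Rb+Ri, as in "lwzx r3,r1,r2". */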
+
+PPCAMode* dopyPPCAMode ( PPCAMode* am ) {
+ switch (am->tag) {
+ case Pam_IR:
+ return PPCAMode_IR( am->Pam.IR.index, am->Pam.IR.base );
+ case Pam_RR:
+ return PPCAMode_RR( am->Pam.RR.index, am->Pam.RR.base );
+ default:
+ vpanic("dopyPPCAMode");
+ }
+}
+
+void ppPPCAMode ( PPCAMode* am ) {
+ switch (am->tag) {
+ case Pam_IR:
+ if (am->Pam.IR.index == 0)
+ vex_printf("0(");
+ else
+ vex_printf("%d(", (Int)am->Pam.IR.index);
+ ppHRegPPC(am->Pam.IR.base);
+ vex_printf(")");
+ return;
+ case Pam_RR:
+ ppHRegPPC(am->Pam.RR.base);
+ vex_printf(",");
+ ppHRegPPC(am->Pam.RR.index);
+ return;
+ default:
+ vpanic("ppPPCAMode");
+ }
+}
+
+static void addRegUsage_PPCAMode ( HRegUsage* u, PPCAMode* am ) {
+ switch (am->tag) {
+ case Pam_IR:
+ addHRegUse(u, HRmRead, am->Pam.IR.base);
+ return;
+ case Pam_RR:
+ addHRegUse(u, HRmRead, am->Pam.RR.base);
+ addHRegUse(u, HRmRead, am->Pam.RR.index);
+ return;
+ default:
+ vpanic("addRegUsage_PPCAMode");
+ }
+}
+
+static void mapRegs_PPCAMode ( HRegRemap* m, PPCAMode* am ) {
+ switch (am->tag) {
+ case Pam_IR:
+ am->Pam.IR.base = lookupHRegRemap(m, am->Pam.IR.base);
+ return;
+ case Pam_RR:
+ am->Pam.RR.base = lookupHRegRemap(m, am->Pam.RR.base);
+ am->Pam.RR.index = lookupHRegRemap(m, am->Pam.RR.index);
+ return;
+ default:
+ vpanic("mapRegs_PPCAMode");
+ }
+}
+
+/* --------- Operand, which can be a reg or a u16/s16. --------- */
+
+PPCRH* PPCRH_Imm ( Bool syned, UShort imm16 ) {
+ PPCRH* op = LibVEX_Alloc(sizeof(PPCRH));
+ op->tag = Prh_Imm;
+ op->Prh.Imm.syned = syned;
+ op->Prh.Imm.imm16 = imm16;
+ /* If this is a signed value, ensure it's not -32768, so that we
+ are guaranteed always to be able to negate if needed. */
+ if (syned)
+ vassert(imm16 != 0x8000);
+ vassert(syned == True || syned == False);
+ return op;
+}
+PPCRH* PPCRH_Reg ( HReg reg ) {
+ PPCRH* op = LibVEX_Alloc(sizeof(PPCRH));
+ op->tag = Prh_Reg;
+ op->Prh.Reg.reg = reg;
+ return op;
+}
+
+void ppPPCRH ( PPCRH* op ) {
+ switch (op->tag) {
+ case Prh_Imm:
+ if (op->Prh.Imm.syned)
+ vex_printf("%d", (Int)(Short)op->Prh.Imm.imm16);
+ else
+ vex_printf("%u", (UInt)(UShort)op->Prh.Imm.imm16);
+ return;
+ case Prh_Reg:
+ ppHRegPPC(op->Prh.Reg.reg);
+ return;
+ default:
+ vpanic("ppPPCRH");
+ }
+}
+
+/* A PPCRH can only be used in a "read" context (what would it mean
+ to write or modify a literal?) and so we enumerate its registers
+ accordingly. */
+static void addRegUsage_PPCRH ( HRegUsage* u, PPCRH* op ) {
+ switch (op->tag) {
+ case Prh_Imm:
+ return;
+ case Prh_Reg:
+ addHRegUse(u, HRmRead, op->Prh.Reg.reg);
+ return;
+ default:
+ vpanic("addRegUsage_PPCRH");
+ }
+}
+
+static void mapRegs_PPCRH ( HRegRemap* m, PPCRH* op ) {
+ switch (op->tag) {
+ case Prh_Imm:
+ return;
+ case Prh_Reg:
+ op->Prh.Reg.reg = lookupHRegRemap(m, op->Prh.Reg.reg);
+ return;
+ default:
+ vpanic("mapRegs_PPCRH");
+ }
+}
+
+
+/* --------- Operand, which can be a reg or a u32/64. --------- */
+
+PPCRI* PPCRI_Imm ( ULong imm64 ) {
+ PPCRI* op = LibVEX_Alloc(sizeof(PPCRI));
+ op->tag = Pri_Imm;
+ op->Pri.Imm = imm64;
+ return op;
+}
+PPCRI* PPCRI_Reg ( HReg reg ) {
+ PPCRI* op = LibVEX_Alloc(sizeof(PPCRI));
+ op->tag = Pri_Reg;
+ op->Pri.Reg = reg;
+ return op;
+}
+
+void ppPPCRI ( PPCRI* dst ) {
+ switch (dst->tag) {
+ case Pri_Imm:
+ vex_printf("0x%llx", dst->Pri.Imm);
+ break;
+ case Pri_Reg:
+ ppHRegPPC(dst->Pri.Reg);
+ break;
+ default:
+ vpanic("ppPPCRI");
+ }
+}
+
+/* A PPCRI can only be used in a "read" context (what would it
+ mean to write or modify a literal?) and so we enumerate its
+ registers accordingly. */
+static void addRegUsage_PPCRI ( HRegUsage* u, PPCRI* dst ) {
+ switch (dst->tag) {
+ case Pri_Imm:
+ return;
+ case Pri_Reg:
+ addHRegUse(u, HRmRead, dst->Pri.Reg);
+ return;
+ default:
+ vpanic("addRegUsage_PPCRI");
+ }
+}
+
+static void mapRegs_PPCRI ( HRegRemap* m, PPCRI* dst ) {
+ switch (dst->tag) {
+ case Pri_Imm:
+ return;
+ case Pri_Reg:
+ dst->Pri.Reg = lookupHRegRemap(m, dst->Pri.Reg);
+ return;
+ default:
+ vpanic("mapRegs_PPCRI");
+ }
+}
+
+
+/* --------- Operand, which can be a vector reg or a simm5. --------- */
+
+PPCVI5s* PPCVI5s_Imm ( Char simm5 ) {
+ PPCVI5s* op = LibVEX_Alloc(sizeof(PPCVI5s));
+ op->tag = Pvi_Imm;
+ op->Pvi.Imm5s = simm5;
+ vassert(simm5 >= -16 && simm5 <= 15);
+ return op;
+}
+PPCVI5s* PPCVI5s_Reg ( HReg reg ) {
+ PPCVI5s* op = LibVEX_Alloc(sizeof(PPCVI5s));
+ op->tag = Pvi_Reg;
+ op->Pvi.Reg = reg;
+ vassert(hregClass(reg) == HRcVec128);
+ return op;
+}
+
+void ppPPCVI5s ( PPCVI5s* src ) {
+ switch (src->tag) {
+ case Pvi_Imm:
+ vex_printf("%d", (Int)src->Pvi.Imm5s);
+ break;
+ case Pvi_Reg:
+ ppHRegPPC(src->Pvi.Reg);
+ break;
+ default:
+ vpanic("ppPPCVI5s");
+ }
+}
+
+/* A PPCVI5s can only be used in a "read" context (what would it
+ mean to write or modify a literal?) and so we enumerate its
+ registers accordingly. */
+static void addRegUsage_PPCVI5s ( HRegUsage* u, PPCVI5s* dst ) {
+ switch (dst->tag) {
+ case Pvi_Imm:
+ return;
+ case Pvi_Reg:
+ addHRegUse(u, HRmRead, dst->Pvi.Reg);
+ return;
+ default:
+ vpanic("addRegUsage_PPCVI5s");
+ }
+}
+
+static void mapRegs_PPCVI5s ( HRegRemap* m, PPCVI5s* dst ) {
+ switch (dst->tag) {
+ case Pvi_Imm:
+ return;
+ case Pvi_Reg:
+ dst->Pvi.Reg = lookupHRegRemap(m, dst->Pvi.Reg);
+ return;
+ default:
+ vpanic("mapRegs_PPCVI5s");
+ }
+}
+
+
+/* --------- Instructions. --------- */
+
+HChar* showPPCUnaryOp ( PPCUnaryOp op ) {
+ switch (op) {
+ case Pun_NOT: return "not";
+ case Pun_NEG: return "neg";
+ case Pun_CLZ32: return "cntlzw";
+ case Pun_CLZ64: return "cntlzd";
+ case Pun_EXTSW: return "extsw";
+ default: vpanic("showPPCUnaryOp");
+ }
+}
+
+HChar* showPPCAluOp ( PPCAluOp op, Bool immR ) {
+ switch (op) {
+ case Palu_ADD: return immR ? "addi" : "add";
+ case Palu_SUB: return immR ? "subi" : "sub";
+ case Palu_AND: return immR ? "andi." : "and";
+ case Palu_OR: return immR ? "ori" : "or";
+ case Palu_XOR: return immR ? "xori" : "xor";
+ default: vpanic("showPPCAluOp");
+ }
+}
+
+HChar* showPPCShftOp ( PPCShftOp op, Bool immR, Bool sz32 ) {
+ switch (op) {
+ case Pshft_SHL: return sz32 ? (immR ? "slwi" : "slw") :
+ (immR ? "sldi" : "sld");
+ case Pshft_SHR: return sz32 ? (immR ? "srwi" : "srw") :
+ (immR ? "srdi" : "srd");
+ case Pshft_SAR: return sz32 ? (immR ? "srawi" : "sraw") :
+ (immR ? "sradi" : "srad");
+ default: vpanic("showPPCShftOp");
+ }
+}
+
+HChar* showPPCFpOp ( PPCFpOp op ) {
+ switch (op) {
+ case Pfp_ADDD: return "fadd";
+ case Pfp_SUBD: return "fsub";
+ case Pfp_MULD: return "fmul";
+ case Pfp_DIVD: return "fdiv";
+ case Pfp_MADDD: return "fmadd";
+ case Pfp_MSUBD: return "fmsub";
+ case Pfp_MADDS: return "fmadds";
+ case Pfp_MSUBS: return "fmsubs";
+ case Pfp_ADDS: return "fadds";
+ case Pfp_SUBS: return "fsubs";
+ case Pfp_MULS: return "fmuls";
+ case Pfp_DIVS: return "fdivs";
+ case Pfp_SQRT: return "fsqrt";
+ case Pfp_ABS: return "fabs";
+ case Pfp_NEG: return "fneg";
+ case Pfp_MOV: return "fmr";
+ case Pfp_RES: return "fres";
+ case Pfp_RSQRTE: return "frsqrte";
+ case Pfp_FRIM: return "frim";
+ case Pfp_FRIN: return "frin";
+ case Pfp_FRIP: return "frip";
+ case Pfp_FRIZ: return "friz";
+ default: vpanic("showPPCFpOp");
+ }
+}
+
+HChar* showPPCAvOp ( PPCAvOp op ) {
+ switch (op) {
+
+ /* Unary */
+ case Pav_MOV: return "vmr"; /* Mov */
+
+ case Pav_AND: return "vand"; /* Bitwise */
+ case Pav_OR: return "vor";
+ case Pav_XOR: return "vxor";
+ case Pav_NOT: return "vnot";
+
+ case Pav_UNPCKH8S: return "vupkhsb"; /* Unpack */
+ case Pav_UNPCKH16S: return "vupkhsh";
+ case Pav_UNPCKL8S: return "vupklsb";
+ case Pav_UNPCKL16S: return "vupklsh";
+ case Pav_UNPCKHPIX: return "vupkhpx";
+ case Pav_UNPCKLPIX: return "vupklpx";
+
+ /* Integer binary */
+ case Pav_ADDU: return "vaddu_m"; // b,h,w
+ case Pav_QADDU: return "vaddu_s"; // b,h,w
+ case Pav_QADDS: return "vadds_s"; // b,h,w
+
+ case Pav_SUBU: return "vsubu_m"; // b,h,w
+ case Pav_QSUBU: return "vsubu_s"; // b,h,w
+ case Pav_QSUBS: return "vsubs_s"; // b,h,w
+
+ case Pav_OMULU: return "vmulou"; // b,h
+ case Pav_OMULS: return "vmulos"; // b,h
+ case Pav_EMULU: return "vmuleu"; // b,h
+ case Pav_EMULS: return "vmules"; // b,h
+
+ case Pav_AVGU: return "vavgu"; // b,h,w
+ case Pav_AVGS: return "vavgs"; // b,h,w
+
+ case Pav_MAXU: return "vmaxu"; // b,h,w
+ case Pav_MAXS: return "vmaxs"; // b,h,w
+
+ case Pav_MINU: return "vminu"; // b,h,w
+ case Pav_MINS: return "vmins"; // b,h,w
+
+ /* Compare (always affects CR field 6) */
+ case Pav_CMPEQU: return "vcmpequ"; // b,h,w
+ case Pav_CMPGTU: return "vcmpgtu"; // b,h,w
+ case Pav_CMPGTS: return "vcmpgts"; // b,h,w
+
+ /* Shift */
+ case Pav_SHL: return "vsl"; // ' ',b,h,w
+ case Pav_SHR: return "vsr"; // ' ',b,h,w
+ case Pav_SAR: return "vsra"; // b,h,w
+ case Pav_ROTL: return "vrl"; // b,h,w
+
+ /* Pack */
+ case Pav_PACKUU: return "vpku_um"; // h,w
+ case Pav_QPACKUU: return "vpku_us"; // h,w
+ case Pav_QPACKSU: return "vpks_us"; // h,w
+ case Pav_QPACKSS: return "vpks_ss"; // h,w
+ case Pav_PACKPXL: return "vpkpx";
+
+ /* Merge */
+ case Pav_MRGHI: return "vmrgh"; // b,h,w
+ case Pav_MRGLO: return "vmrgl"; // b,h,w
+
+ default: vpanic("showPPCAvOp");
+ }
+}
+
+HChar* showPPCAvFpOp ( PPCAvFpOp op ) {
+ switch (op) {
+ /* Floating Point Binary */
+ case Pavfp_ADDF: return "vaddfp";
+ case Pavfp_SUBF: return "vsubfp";
+ case Pavfp_MULF: return "vmaddfp";
+ case Pavfp_MAXF: return "vmaxfp";
+ case Pavfp_MINF: return "vminfp";
+ case Pavfp_CMPEQF: return "vcmpeqfp";
+ case Pavfp_CMPGTF: return "vcmpgtfp";
+ case Pavfp_CMPGEF: return "vcmpgefp";
+
+ /* Floating Point Unary */
+ case Pavfp_RCPF: return "vrefp";
+ case Pavfp_RSQRTF: return "vrsqrtefp";
+ case Pavfp_CVTU2F: return "vcfux";
+ case Pavfp_CVTS2F: return "vcfsx";
+ case Pavfp_QCVTF2U: return "vctuxs";
+ case Pavfp_QCVTF2S: return "vctsxs";
+ case Pavfp_ROUNDM: return "vrfim";
+ case Pavfp_ROUNDP: return "vrfip";
+ case Pavfp_ROUNDN: return "vrfin";
+ case Pavfp_ROUNDZ: return "vrfiz";
+
+ default: vpanic("showPPCAvFpOp");
+ }
+}
+
+PPCInstr* PPCInstr_LI ( HReg dst, ULong imm64, Bool mode64 )
+{
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_LI;
+ i->Pin.LI.dst = dst;
+ i->Pin.LI.imm64 = imm64;
+ if (!mode64)
+ vassert( (Long)imm64 == (Long)(Int)(UInt)imm64 );
+ return i;
+}
+PPCInstr* PPCInstr_Alu ( PPCAluOp op, HReg dst,
+ HReg srcL, PPCRH* srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Alu;
+ i->Pin.Alu.op = op;
+ i->Pin.Alu.dst = dst;
+ i->Pin.Alu.srcL = srcL;
+ i->Pin.Alu.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_Shft ( PPCShftOp op, Bool sz32,
+ HReg dst, HReg srcL, PPCRH* srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Shft;
+ i->Pin.Shft.op = op;
+ i->Pin.Shft.sz32 = sz32;
+ i->Pin.Shft.dst = dst;
+ i->Pin.Shft.srcL = srcL;
+ i->Pin.Shft.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_AddSubC ( Bool isAdd, Bool setC,
+ HReg dst, HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AddSubC;
+ i->Pin.AddSubC.isAdd = isAdd;
+ i->Pin.AddSubC.setC = setC;
+ i->Pin.AddSubC.dst = dst;
+ i->Pin.AddSubC.srcL = srcL;
+ i->Pin.AddSubC.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_Cmp ( Bool syned, Bool sz32,
+ UInt crfD, HReg srcL, PPCRH* srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Cmp;
+ i->Pin.Cmp.syned = syned;
+ i->Pin.Cmp.sz32 = sz32;
+ i->Pin.Cmp.crfD = crfD;
+ i->Pin.Cmp.srcL = srcL;
+ i->Pin.Cmp.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_Unary ( PPCUnaryOp op, HReg dst, HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Unary;
+ i->Pin.Unary.op = op;
+ i->Pin.Unary.dst = dst;
+ i->Pin.Unary.src = src;
+ return i;
+}
+PPCInstr* PPCInstr_MulL ( Bool syned, Bool hi, Bool sz32,
+ HReg dst, HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_MulL;
+ i->Pin.MulL.syned = syned;
+ i->Pin.MulL.hi = hi;
+ i->Pin.MulL.sz32 = sz32;
+ i->Pin.MulL.dst = dst;
+ i->Pin.MulL.srcL = srcL;
+ i->Pin.MulL.srcR = srcR;
+ /* if doing the low word, the signedness is irrelevant, but tie it
+ down anyway. */
+ if (!hi) vassert(!syned);
+ return i;
+}
+PPCInstr* PPCInstr_Div ( Bool syned, Bool sz32,
+ HReg dst, HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Div;
+ i->Pin.Div.syned = syned;
+ i->Pin.Div.sz32 = sz32;
+ i->Pin.Div.dst = dst;
+ i->Pin.Div.srcL = srcL;
+ i->Pin.Div.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_Call ( PPCCondCode cond,
+ Addr64 target, UInt argiregs ) {
+ UInt mask;
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Call;
+ i->Pin.Call.cond = cond;
+ i->Pin.Call.target = target;
+ i->Pin.Call.argiregs = argiregs;
+ /* Only r3 .. r10 inclusive may be used as arg regs. Hence: */
+ mask = (1<<3)|(1<<4)|(1<<5)|(1<<6)|(1<<7)|(1<<8)|(1<<9)|(1<<10);
+ vassert(0 == (argiregs & ~mask));
+ return i;
+}
+PPCInstr* PPCInstr_Goto ( IRJumpKind jk,
+ PPCCondCode cond, PPCRI* dst ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Goto;
+ i->Pin.Goto.cond = cond;
+ i->Pin.Goto.dst = dst;
+ i->Pin.Goto.jk = jk;
+ return i;
+}
+PPCInstr* PPCInstr_CMov ( PPCCondCode cond,
+ HReg dst, PPCRI* src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_CMov;
+ i->Pin.CMov.cond = cond;
+ i->Pin.CMov.src = src;
+ i->Pin.CMov.dst = dst;
+ vassert(cond.test != Pct_ALWAYS);
+ return i;
+}
+PPCInstr* PPCInstr_Load ( UChar sz,
+ HReg dst, PPCAMode* src, Bool mode64 ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Load;
+ i->Pin.Load.sz = sz;
+ i->Pin.Load.src = src;
+ i->Pin.Load.dst = dst;
+ vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);
+ if (sz == 8) vassert(mode64);
+ return i;
+}
+PPCInstr* PPCInstr_LoadL ( UChar sz,
+ HReg dst, HReg src, Bool mode64 )
+{
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_LoadL;
+ i->Pin.LoadL.sz = sz;
+ i->Pin.LoadL.src = src;
+ i->Pin.LoadL.dst = dst;
+ vassert(sz == 4 || sz == 8);
+ if (sz == 8) vassert(mode64);
+ return i;
+}
+PPCInstr* PPCInstr_Store ( UChar sz, PPCAMode* dst, HReg src,
+ Bool mode64 ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Store;
+ i->Pin.Store.sz = sz;
+ i->Pin.Store.src = src;
+ i->Pin.Store.dst = dst;
+ vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);
+ if (sz == 8) vassert(mode64);
+ return i;
+}
+PPCInstr* PPCInstr_StoreC ( UChar sz, HReg dst, HReg src, Bool mode64 ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_StoreC;
+ i->Pin.StoreC.sz = sz;
+ i->Pin.StoreC.src = src;
+ i->Pin.StoreC.dst = dst;
+ vassert(sz == 4 || sz == 8);
+ if (sz == 8) vassert(mode64);
+ return i;
+}
+PPCInstr* PPCInstr_Set ( PPCCondCode cond, HReg dst ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_Set;
+ i->Pin.Set.cond = cond;
+ i->Pin.Set.dst = dst;
+ return i;
+}
+PPCInstr* PPCInstr_MfCR ( HReg dst )
+{
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_MfCR;
+ i->Pin.MfCR.dst = dst;
+ return i;
+}
+PPCInstr* PPCInstr_MFence ( void )
+{
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_MFence;
+ return i;
+}
+
+PPCInstr* PPCInstr_FpUnary ( PPCFpOp op, HReg dst, HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpUnary;
+ i->Pin.FpUnary.op = op;
+ i->Pin.FpUnary.dst = dst;
+ i->Pin.FpUnary.src = src;
+ return i;
+}
+PPCInstr* PPCInstr_FpBinary ( PPCFpOp op, HReg dst,
+ HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpBinary;
+ i->Pin.FpBinary.op = op;
+ i->Pin.FpBinary.dst = dst;
+ i->Pin.FpBinary.srcL = srcL;
+ i->Pin.FpBinary.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_FpMulAcc ( PPCFpOp op, HReg dst, HReg srcML,
+ HReg srcMR, HReg srcAcc )
+{
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpMulAcc;
+ i->Pin.FpMulAcc.op = op;
+ i->Pin.FpMulAcc.dst = dst;
+ i->Pin.FpMulAcc.srcML = srcML;
+ i->Pin.FpMulAcc.srcMR = srcMR;
+ i->Pin.FpMulAcc.srcAcc = srcAcc;
+ return i;
+}
+PPCInstr* PPCInstr_FpLdSt ( Bool isLoad, UChar sz,
+ HReg reg, PPCAMode* addr ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpLdSt;
+ i->Pin.FpLdSt.isLoad = isLoad;
+ i->Pin.FpLdSt.sz = sz;
+ i->Pin.FpLdSt.reg = reg;
+ i->Pin.FpLdSt.addr = addr;
+ vassert(sz == 4 || sz == 8);
+ return i;
+}
+PPCInstr* PPCInstr_FpSTFIW ( HReg addr, HReg data )
+{
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpSTFIW;
+ i->Pin.FpSTFIW.addr = addr;
+ i->Pin.FpSTFIW.data = data;
+ return i;
+}
+PPCInstr* PPCInstr_FpRSP ( HReg dst, HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpRSP;
+ i->Pin.FpRSP.dst = dst;
+ i->Pin.FpRSP.src = src;
+ return i;
+}
+PPCInstr* PPCInstr_FpCftI ( Bool fromI, Bool int32,
+ HReg dst, HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpCftI;
+ i->Pin.FpCftI.fromI = fromI;
+ i->Pin.FpCftI.int32 = int32;
+ i->Pin.FpCftI.dst = dst;
+ i->Pin.FpCftI.src = src;
+ vassert(!(int32 && fromI)); /* no such insn ("fcfiw"). */
+ return i;
+}
+PPCInstr* PPCInstr_FpCMov ( PPCCondCode cond, HReg dst, HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpCMov;
+ i->Pin.FpCMov.cond = cond;
+ i->Pin.FpCMov.dst = dst;
+ i->Pin.FpCMov.src = src;
+ vassert(cond.test != Pct_ALWAYS);
+ return i;
+}
+PPCInstr* PPCInstr_FpLdFPSCR ( HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpLdFPSCR;
+ i->Pin.FpLdFPSCR.src = src;
+ return i;
+}
+PPCInstr* PPCInstr_FpCmp ( HReg dst, HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_FpCmp;
+ i->Pin.FpCmp.dst = dst;
+ i->Pin.FpCmp.srcL = srcL;
+ i->Pin.FpCmp.srcR = srcR;
+ return i;
+}
+
+/* Read/Write Link Register */
+PPCInstr* PPCInstr_RdWrLR ( Bool wrLR, HReg gpr ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_RdWrLR;
+ i->Pin.RdWrLR.wrLR = wrLR;
+ i->Pin.RdWrLR.gpr = gpr;
+ return i;
+}
+
+/* AltiVec */
+PPCInstr* PPCInstr_AvLdSt ( Bool isLoad, UChar sz,
+ HReg reg, PPCAMode* addr ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvLdSt;
+ i->Pin.AvLdSt.isLoad = isLoad;
+ i->Pin.AvLdSt.sz = sz;
+ i->Pin.AvLdSt.reg = reg;
+ i->Pin.AvLdSt.addr = addr;
+ return i;
+}
+PPCInstr* PPCInstr_AvUnary ( PPCAvOp op, HReg dst, HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvUnary;
+ i->Pin.AvUnary.op = op;
+ i->Pin.AvUnary.dst = dst;
+ i->Pin.AvUnary.src = src;
+ return i;
+}
+PPCInstr* PPCInstr_AvBinary ( PPCAvOp op, HReg dst,
+ HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvBinary;
+ i->Pin.AvBinary.op = op;
+ i->Pin.AvBinary.dst = dst;
+ i->Pin.AvBinary.srcL = srcL;
+ i->Pin.AvBinary.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_AvBin8x16 ( PPCAvOp op, HReg dst,
+ HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvBin8x16;
+ i->Pin.AvBin8x16.op = op;
+ i->Pin.AvBin8x16.dst = dst;
+ i->Pin.AvBin8x16.srcL = srcL;
+ i->Pin.AvBin8x16.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_AvBin16x8 ( PPCAvOp op, HReg dst,
+ HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvBin16x8;
+ i->Pin.AvBin16x8.op = op;
+ i->Pin.AvBin16x8.dst = dst;
+ i->Pin.AvBin16x8.srcL = srcL;
+ i->Pin.AvBin16x8.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_AvBin32x4 ( PPCAvOp op, HReg dst,
+ HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvBin32x4;
+ i->Pin.AvBin32x4.op = op;
+ i->Pin.AvBin32x4.dst = dst;
+ i->Pin.AvBin32x4.srcL = srcL;
+ i->Pin.AvBin32x4.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_AvBin32Fx4 ( PPCAvOp op, HReg dst,
+ HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvBin32Fx4;
+ i->Pin.AvBin32Fx4.op = op;
+ i->Pin.AvBin32Fx4.dst = dst;
+ i->Pin.AvBin32Fx4.srcL = srcL;
+ i->Pin.AvBin32Fx4.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_AvUn32Fx4 ( PPCAvOp op, HReg dst, HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvUn32Fx4;
+ i->Pin.AvUn32Fx4.op = op;
+ i->Pin.AvUn32Fx4.dst = dst;
+ i->Pin.AvUn32Fx4.src = src;
+ return i;
+}
+PPCInstr* PPCInstr_AvPerm ( HReg dst, HReg srcL, HReg srcR, HReg ctl ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvPerm;
+ i->Pin.AvPerm.dst = dst;
+ i->Pin.AvPerm.srcL = srcL;
+ i->Pin.AvPerm.srcR = srcR;
+ i->Pin.AvPerm.ctl = ctl;
+ return i;
+}
+PPCInstr* PPCInstr_AvSel ( HReg ctl, HReg dst, HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvSel;
+ i->Pin.AvSel.ctl = ctl;
+ i->Pin.AvSel.dst = dst;
+ i->Pin.AvSel.srcL = srcL;
+ i->Pin.AvSel.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_AvShlDbl ( UChar shift, HReg dst,
+ HReg srcL, HReg srcR ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvShlDbl;
+ i->Pin.AvShlDbl.shift = shift;
+ i->Pin.AvShlDbl.dst = dst;
+ i->Pin.AvShlDbl.srcL = srcL;
+ i->Pin.AvShlDbl.srcR = srcR;
+ return i;
+}
+PPCInstr* PPCInstr_AvSplat ( UChar sz, HReg dst, PPCVI5s* src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvSplat;
+ i->Pin.AvSplat.sz = sz;
+ i->Pin.AvSplat.dst = dst;
+ i->Pin.AvSplat.src = src;
+ return i;
+}
+PPCInstr* PPCInstr_AvCMov ( PPCCondCode cond, HReg dst, HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvCMov;
+ i->Pin.AvCMov.cond = cond;
+ i->Pin.AvCMov.dst = dst;
+ i->Pin.AvCMov.src = src;
+ vassert(cond.test != Pct_ALWAYS);
+ return i;
+}
+PPCInstr* PPCInstr_AvLdVSCR ( HReg src ) {
+ PPCInstr* i = LibVEX_Alloc(sizeof(PPCInstr));
+ i->tag = Pin_AvLdVSCR;
+ i->Pin.AvLdVSCR.src = src;
+ return i;
+}
+
+
+/* Pretty Print instructions */
+static void ppLoadImm ( HReg dst, ULong imm, Bool mode64 ) {
+ vex_printf("li_word ");
+ ppHRegPPC(dst);
+ if (!mode64) {
+ vex_printf(",0x%08x", (UInt)imm);
+ } else {
+ vex_printf(",0x%016llx", imm);
+ }
+}
+
+static void ppMovReg ( HReg dst, HReg src ) {
+ if (hregNumber(dst) != hregNumber(src)) {
+ vex_printf("mr ");
+ ppHRegPPC(dst);
+ vex_printf(",");
+ ppHRegPPC(src);
+ }
+}
+
+void ppPPCInstr ( PPCInstr* i, Bool mode64 )
+{
+ switch (i->tag) {
+ case Pin_LI:
+ ppLoadImm(i->Pin.LI.dst, i->Pin.LI.imm64, mode64);
+ break;
+ case Pin_Alu: {
+ HReg r_srcL = i->Pin.Alu.srcL;
+ PPCRH* rh_srcR = i->Pin.Alu.srcR;
+ /* special-case "mr" */
+ if (i->Pin.Alu.op == Palu_OR && // or Rd,Rs,Rs == mr Rd,Rs
+ rh_srcR->tag == Prh_Reg &&
+ rh_srcR->Prh.Reg.reg == r_srcL) {
+ vex_printf("mr ");
+ ppHRegPPC(i->Pin.Alu.dst);
+ vex_printf(",");
+ ppHRegPPC(r_srcL);
+ return;
+ }
+ /* special-case "li" */
+ if (i->Pin.Alu.op == Palu_ADD && // addi Rd,0,imm == li Rd,imm
+ rh_srcR->tag == Prh_Imm &&
+ hregNumber(r_srcL) == 0) {
+ vex_printf("li ");
+ ppHRegPPC(i->Pin.Alu.dst);
+ vex_printf(",");
+ ppPPCRH(rh_srcR);
+ return;
+ }
+ /* generic */
+ vex_printf("%s ", showPPCAluOp(i->Pin.Alu.op,
+ toBool(rh_srcR->tag == Prh_Imm)));
+ ppHRegPPC(i->Pin.Alu.dst);
+ vex_printf(",");
+ ppHRegPPC(r_srcL);
+ vex_printf(",");
+ ppPPCRH(rh_srcR);
+ return;
+ }
+ case Pin_Shft: {
+ HReg r_srcL = i->Pin.Shft.srcL;
+ PPCRH* rh_srcR = i->Pin.Shft.srcR;
+ vex_printf("%s ", showPPCShftOp(i->Pin.Shft.op,
+ toBool(rh_srcR->tag == Prh_Imm),
+ i->Pin.Shft.sz32));
+ ppHRegPPC(i->Pin.Shft.dst);
+ vex_printf(",");
+ ppHRegPPC(r_srcL);
+ vex_printf(",");
+ ppPPCRH(rh_srcR);
+ return;
+ }
+ case Pin_AddSubC:
+ vex_printf("%s%s ",
+ i->Pin.AddSubC.isAdd ? "add" : "sub",
+ i->Pin.AddSubC.setC ? "c" : "e");
+ ppHRegPPC(i->Pin.AddSubC.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AddSubC.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AddSubC.srcR);
+ return;
+ case Pin_Cmp:
+ vex_printf("%s%c%s %%cr%u,",
+ i->Pin.Cmp.syned ? "cmp" : "cmpl",
+ i->Pin.Cmp.sz32 ? 'w' : 'd',
+ i->Pin.Cmp.srcR->tag == Prh_Imm ? "i" : "",
+ i->Pin.Cmp.crfD);
+ ppHRegPPC(i->Pin.Cmp.srcL);
+ vex_printf(",");
+ ppPPCRH(i->Pin.Cmp.srcR);
+ return;
+ case Pin_Unary:
+ vex_printf("%s ", showPPCUnaryOp(i->Pin.Unary.op));
+ ppHRegPPC(i->Pin.Unary.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.Unary.src);
+ return;
+ case Pin_MulL:
+ vex_printf("mul%c%c%s ",
+ i->Pin.MulL.hi ? 'h' : 'l',
+ i->Pin.MulL.sz32 ? 'w' : 'd',
+ i->Pin.MulL.hi ? (i->Pin.MulL.syned ? "s" : "u") : "");
+ ppHRegPPC(i->Pin.MulL.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.MulL.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.MulL.srcR);
+ return;
+ case Pin_Div:
+ vex_printf("div%c%s ",
+ i->Pin.Div.sz32 ? 'w' : 'd',
+ i->Pin.Div.syned ? "" : "u");
+ ppHRegPPC(i->Pin.Div.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.Div.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.Div.srcR);
+ return;
+ case Pin_Call: {
+ Int n;
+ vex_printf("call: ");
+ if (i->Pin.Call.cond.test != Pct_ALWAYS) {
+ vex_printf("if (%s) ", showPPCCondCode(i->Pin.Call.cond));
+ }
+ vex_printf("{ ");
+ ppLoadImm(hregPPC_GPR10(mode64), i->Pin.Call.target, mode64);
+ vex_printf(" ; mtctr r10 ; bctrl [");
+ for (n = 0; n < 32; n++) {
+ if (i->Pin.Call.argiregs & (1<<n)) {
+ vex_printf("r%d", n);
+ if ((i->Pin.Call.argiregs >> n) > 1)
+ vex_printf(",");
+ }
+ }
+ vex_printf("] }");
+ break;
+ }
+ case Pin_Goto:
+ vex_printf("goto: ");
+ if (i->Pin.Goto.cond.test != Pct_ALWAYS) {
+ vex_printf("if (%s) ", showPPCCondCode(i->Pin.Goto.cond));
+ }
+ vex_printf("{ ");
+ if (i->Pin.Goto.jk != Ijk_Boring
+ && i->Pin.Goto.jk != Ijk_Call
+ && i->Pin.Goto.jk != Ijk_Ret) {
+ vex_printf("li %%r31,$");
+ ppIRJumpKind(i->Pin.Goto.jk);
+ vex_printf(" ; ");
+ }
+ if (i->Pin.Goto.dst->tag == Pri_Imm) {
+ ppLoadImm(hregPPC_GPR3(mode64), i->Pin.Goto.dst->Pri.Imm,
+ mode64);
+ } else {
+ ppMovReg(hregPPC_GPR3(mode64), i->Pin.Goto.dst->Pri.Reg);
+ }
+ vex_printf(" ; blr }");
+ return;
+ case Pin_CMov:
+ vex_printf("cmov (%s) ", showPPCCondCode(i->Pin.CMov.cond));
+ ppHRegPPC(i->Pin.CMov.dst);
+ vex_printf(",");
+ ppPPCRI(i->Pin.CMov.src);
+ vex_printf(": ");
+ if (i->Pin.CMov.cond.test != Pct_ALWAYS) {
+ vex_printf("if (%s) ", showPPCCondCode(i->Pin.CMov.cond));
+ }
+ vex_printf("{ ");
+ if (i->Pin.CMov.src->tag == Pri_Imm) {
+ ppLoadImm(i->Pin.CMov.dst, i->Pin.CMov.src->Pri.Imm, mode64);
+ } else {
+ ppMovReg(i->Pin.CMov.dst, i->Pin.CMov.src->Pri.Reg);
+ }
+ vex_printf(" }");
+ return;
+ case Pin_Load: {
+ Bool idxd = toBool(i->Pin.Load.src->tag == Pam_RR);
+ UChar sz = i->Pin.Load.sz;
+ UChar c_sz = sz==1 ? 'b' : sz==2 ? 'h' : sz==4 ? 'w' : 'd';
+ vex_printf("l%c%s%s ", c_sz, sz==8 ? "" : "z", idxd ? "x" : "" );
+ ppHRegPPC(i->Pin.Load.dst);
+ vex_printf(",");
+ ppPPCAMode(i->Pin.Load.src);
+ return;
+ }
+ case Pin_LoadL:
+ vex_printf("l%carx ", i->Pin.LoadL.sz==4 ? 'w' : 'd');
+ ppHRegPPC(i->Pin.LoadL.dst);
+ vex_printf(",%%r0,");
+ ppHRegPPC(i->Pin.LoadL.src);
+ return;
+ case Pin_Store: {
+ UChar sz = i->Pin.Store.sz;
+ Bool idxd = toBool(i->Pin.Store.dst->tag == Pam_RR);
+ UChar c_sz = sz==1 ? 'b' : sz==2 ? 'h' : sz==4 ? 'w' : /*8*/ 'd';
+ vex_printf("st%c%s ", c_sz, idxd ? "x" : "" );
+ ppHRegPPC(i->Pin.Store.src);
+ vex_printf(",");
+ ppPPCAMode(i->Pin.Store.dst);
+ return;
+ }
+ case Pin_StoreC:
+ vex_printf("st%ccx. ", i->Pin.StoreC.sz==4 ? 'w' : 'd');
+ ppHRegPPC(i->Pin.StoreC.src);
+ vex_printf(",%%r0,");
+ ppHRegPPC(i->Pin.StoreC.dst);
+ return;
+ case Pin_Set: {
+ PPCCondCode cc = i->Pin.Set.cond;
+ vex_printf("set (%s),", showPPCCondCode(cc));
+ ppHRegPPC(i->Pin.Set.dst);
+ if (cc.test == Pct_ALWAYS) {
+ vex_printf(": { li ");
+ ppHRegPPC(i->Pin.Set.dst);
+ vex_printf(",1 }");
+ } else {
+ vex_printf(": { mfcr r0 ; rlwinm ");
+ ppHRegPPC(i->Pin.Set.dst);
+ vex_printf(",r0,%u,31,31", cc.flag+1);
+ if (cc.test == Pct_FALSE) {
+ vex_printf("; xori ");
+ ppHRegPPC(i->Pin.Set.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.Set.dst);
+ vex_printf(",1");
+ }
+ vex_printf(" }");
+ }
+ return;
+ }
+ case Pin_MfCR:
+ vex_printf("mfcr ");
+ ppHRegPPC(i->Pin.MfCR.dst);
+ break;
+ case Pin_MFence:
+ vex_printf("mfence (=sync)");
+ return;
+
+ case Pin_FpUnary:
+ vex_printf("%s ", showPPCFpOp(i->Pin.FpUnary.op));
+ ppHRegPPC(i->Pin.FpUnary.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpUnary.src);
+ return;
+ case Pin_FpBinary:
+ vex_printf("%s ", showPPCFpOp(i->Pin.FpBinary.op));
+ ppHRegPPC(i->Pin.FpBinary.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpBinary.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpBinary.srcR);
+ return;
+ case Pin_FpMulAcc:
+ vex_printf("%s ", showPPCFpOp(i->Pin.FpMulAcc.op));
+ ppHRegPPC(i->Pin.FpMulAcc.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpMulAcc.srcML);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpMulAcc.srcMR);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpMulAcc.srcAcc);
+ return;
+ case Pin_FpLdSt: {
+ UChar sz = i->Pin.FpLdSt.sz;
+ Bool idxd = toBool(i->Pin.FpLdSt.addr->tag == Pam_RR);
+ if (i->Pin.FpLdSt.isLoad) {
+ vex_printf("lf%c%s ",
+ (sz==4 ? 's' : 'd'),
+ idxd ? "x" : "" );
+ ppHRegPPC(i->Pin.FpLdSt.reg);
+ vex_printf(",");
+ ppPPCAMode(i->Pin.FpLdSt.addr);
+ } else {
+ vex_printf("stf%c%s ",
+ (sz==4 ? 's' : 'd'),
+ idxd ? "x" : "" );
+ ppHRegPPC(i->Pin.FpLdSt.reg);
+ vex_printf(",");
+ ppPPCAMode(i->Pin.FpLdSt.addr);
+ }
+ return;
+ }
+ case Pin_FpSTFIW:
+ vex_printf("stfiwz ");
+ ppHRegPPC(i->Pin.FpSTFIW.data);
+ vex_printf(",0(");
+ ppHRegPPC(i->Pin.FpSTFIW.addr);
+ vex_printf(")");
+ return;
+ case Pin_FpRSP:
+ vex_printf("frsp ");
+ ppHRegPPC(i->Pin.FpRSP.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpRSP.src);
+ return;
+ case Pin_FpCftI: {
+ HChar* str = "fc???";
+ if (i->Pin.FpCftI.fromI == False && i->Pin.FpCftI.int32 == False)
+ str = "fctid";
+ else
+ if (i->Pin.FpCftI.fromI == False && i->Pin.FpCftI.int32 == True)
+ str = "fctiw";
+ else
+ if (i->Pin.FpCftI.fromI == True && i->Pin.FpCftI.int32 == False)
+ str = "fcfid";
+ vex_printf("%s ", str);
+ ppHRegPPC(i->Pin.FpCftI.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpCftI.src);
+ return;
+ }
+ case Pin_FpCMov:
+ vex_printf("fpcmov (%s) ", showPPCCondCode(i->Pin.FpCMov.cond));
+ ppHRegPPC(i->Pin.FpCMov.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpCMov.src);
+ vex_printf(": ");
+ vex_printf("if (fr_dst != fr_src) { ");
+ if (i->Pin.FpCMov.cond.test != Pct_ALWAYS) {
+ vex_printf("if (%s) { ", showPPCCondCode(i->Pin.FpCMov.cond));
+ }
+ vex_printf("fmr ");
+ ppHRegPPC(i->Pin.FpCMov.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpCMov.src);
+ if (i->Pin.FpCMov.cond.test != Pct_ALWAYS)
+ vex_printf(" }");
+ vex_printf(" }");
+ return;
+ case Pin_FpLdFPSCR:
+ vex_printf("mtfsf 0xFF,");
+ ppHRegPPC(i->Pin.FpLdFPSCR.src);
+ return;
+ case Pin_FpCmp:
+ vex_printf("fcmpo %%cr1,");
+ ppHRegPPC(i->Pin.FpCmp.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpCmp.srcR);
+ vex_printf("; mfcr ");
+ ppHRegPPC(i->Pin.FpCmp.dst);
+ vex_printf("; rlwinm ");
+ ppHRegPPC(i->Pin.FpCmp.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.FpCmp.dst);
+ vex_printf(",8,28,31");
+ return;
+
+ case Pin_RdWrLR:
+ vex_printf("%s ", i->Pin.RdWrLR.wrLR ? "mtlr" : "mflr");
+ ppHRegPPC(i->Pin.RdWrLR.gpr);
+ return;
+
+ case Pin_AvLdSt: {
+ UChar sz = i->Pin.AvLdSt.sz;
+ HChar* str_size;
+ if (i->Pin.AvLdSt.addr->tag == Pam_IR) {
+ ppLoadImm(hregPPC_GPR30(mode64),
+ i->Pin.AvLdSt.addr->Pam.IR.index, mode64);
+ vex_printf(" ; ");
+ }
+ str_size = sz==1 ? "eb" : sz==2 ? "eh" : sz==4 ? "ew" : "";
+ if (i->Pin.AvLdSt.isLoad)
+ vex_printf("lv%sx ", str_size);
+ else
+ vex_printf("stv%sx ", str_size);
+ ppHRegPPC(i->Pin.AvLdSt.reg);
+ vex_printf(",");
+ if (i->Pin.AvLdSt.addr->tag == Pam_IR)
+ vex_printf("%%r30");
+ else
+ ppHRegPPC(i->Pin.AvLdSt.addr->Pam.RR.index);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvLdSt.addr->Pam.RR.base);
+ return;
+ }
+ case Pin_AvUnary:
+ vex_printf("%s ", showPPCAvOp(i->Pin.AvUnary.op));
+ ppHRegPPC(i->Pin.AvUnary.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvUnary.src);
+ return;
+ case Pin_AvBinary:
+ vex_printf("%s ", showPPCAvOp(i->Pin.AvBinary.op));
+ ppHRegPPC(i->Pin.AvBinary.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBinary.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBinary.srcR);
+ return;
+ case Pin_AvBin8x16:
+ vex_printf("%s(b) ", showPPCAvOp(i->Pin.AvBin8x16.op));
+ ppHRegPPC(i->Pin.AvBin8x16.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBin8x16.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBin8x16.srcR);
+ return;
+ case Pin_AvBin16x8:
+ vex_printf("%s(h) ", showPPCAvOp(i->Pin.AvBin16x8.op));
+ ppHRegPPC(i->Pin.AvBin16x8.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBin16x8.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBin16x8.srcR);
+ return;
+ case Pin_AvBin32x4:
+ vex_printf("%s(w) ", showPPCAvOp(i->Pin.AvBin32x4.op));
+ ppHRegPPC(i->Pin.AvBin32x4.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBin32x4.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBin32x4.srcR);
+ return;
+ case Pin_AvBin32Fx4:
+ vex_printf("%s ", showPPCAvFpOp(i->Pin.AvBin32Fx4.op));
+ ppHRegPPC(i->Pin.AvBin32Fx4.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBin32Fx4.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvBin32Fx4.srcR);
+ return;
+ case Pin_AvUn32Fx4:
+ vex_printf("%s ", showPPCAvFpOp(i->Pin.AvUn32Fx4.op));
+ ppHRegPPC(i->Pin.AvUn32Fx4.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvUn32Fx4.src);
+ return;
+ case Pin_AvPerm:
+ vex_printf("vperm ");
+ ppHRegPPC(i->Pin.AvPerm.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvPerm.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvPerm.srcR);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvPerm.ctl);
+ return;
+
+ case Pin_AvSel:
+ vex_printf("vsel ");
+ ppHRegPPC(i->Pin.AvSel.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvSel.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvSel.srcR);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvSel.ctl);
+ return;
+
+ case Pin_AvShlDbl:
+ vex_printf("vsldoi ");
+ ppHRegPPC(i->Pin.AvShlDbl.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvShlDbl.srcL);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvShlDbl.srcR);
+ vex_printf(",%d", i->Pin.AvShlDbl.shift);
+ return;
+
+ case Pin_AvSplat: {
+ UChar sz = i->Pin.AvSplat.sz;
+ UChar ch_sz = toUChar( (sz == 8) ? 'b' : (sz == 16) ? 'h' : 'w' );
+ vex_printf("vsplt%s%c ",
+ i->Pin.AvSplat.src->tag == Pvi_Imm ? "is" : "", ch_sz);
+ ppHRegPPC(i->Pin.AvSplat.dst);
+ vex_printf(",");
+ ppPPCVI5s(i->Pin.AvSplat.src);
+ if (i->Pin.AvSplat.src->tag == Pvi_Reg)
+ vex_printf(", %d", (128/sz)-1); /* louis lane */
+ return;
+ }
+
+ case Pin_AvCMov:
+ vex_printf("avcmov (%s) ", showPPCCondCode(i->Pin.AvCMov.cond));
+ ppHRegPPC(i->Pin.AvCMov.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvCMov.src);
+ vex_printf(": ");
+ vex_printf("if (v_dst != v_src) { ");
+ if (i->Pin.AvCMov.cond.test != Pct_ALWAYS) {
+ vex_printf("if (%s) { ", showPPCCondCode(i->Pin.AvCMov.cond));
+ }
+ vex_printf("vmr ");
+ ppHRegPPC(i->Pin.AvCMov.dst);
+ vex_printf(",");
+ ppHRegPPC(i->Pin.AvCMov.src);
+ if (i->Pin.AvCMov.cond.test != Pct_ALWAYS)
+ vex_printf(" }");
+ vex_printf(" }");
+ return;
+
+ case Pin_AvLdVSCR:
+ vex_printf("mtvscr ");
+ ppHRegPPC(i->Pin.AvLdVSCR.src);
+ return;
+
+ default:
+ vex_printf("\nppPPCInstr: No such tag(%d)\n", (Int)i->tag);
+ vpanic("ppPPCInstr");
+ }
+}
+
+/* --------- Helpers for register allocation. --------- */
+
+void getRegUsage_PPCInstr ( HRegUsage* u, PPCInstr* i, Bool mode64 )
+{
+ initHRegUsage(u);
+ switch (i->tag) {
+ case Pin_LI:
+ addHRegUse(u, HRmWrite, i->Pin.LI.dst);
+ break;
+ case Pin_Alu:
+ addHRegUse(u, HRmRead, i->Pin.Alu.srcL);
+ addRegUsage_PPCRH(u, i->Pin.Alu.srcR);
+ addHRegUse(u, HRmWrite, i->Pin.Alu.dst);
+ return;
+ case Pin_Shft:
+ addHRegUse(u, HRmRead, i->Pin.Shft.srcL);
+ addRegUsage_PPCRH(u, i->Pin.Shft.srcR);
+ addHRegUse(u, HRmWrite, i->Pin.Shft.dst);
+ return;
+ case Pin_AddSubC:
+ addHRegUse(u, HRmWrite, i->Pin.AddSubC.dst);
+ addHRegUse(u, HRmRead, i->Pin.AddSubC.srcL);
+ addHRegUse(u, HRmRead, i->Pin.AddSubC.srcR);
+ return;
+ case Pin_Cmp:
+ addHRegUse(u, HRmRead, i->Pin.Cmp.srcL);
+ addRegUsage_PPCRH(u, i->Pin.Cmp.srcR);
+ return;
+ case Pin_Unary:
+ addHRegUse(u, HRmWrite, i->Pin.Unary.dst);
+ addHRegUse(u, HRmRead, i->Pin.Unary.src);
+ return;
+ case Pin_MulL:
+ addHRegUse(u, HRmWrite, i->Pin.MulL.dst);
+ addHRegUse(u, HRmRead, i->Pin.MulL.srcL);
+ addHRegUse(u, HRmRead, i->Pin.MulL.srcR);
+ return;
+ case Pin_Div:
+ addHRegUse(u, HRmWrite, i->Pin.Div.dst);
+ addHRegUse(u, HRmRead, i->Pin.Div.srcL);
+ addHRegUse(u, HRmRead, i->Pin.Div.srcR);
+ return;
+ case Pin_Call: {
+ UInt argir;
+ /* This is a bit subtle. */
+ /* First off, claim it trashes all the caller-saved regs
+ which fall within the register allocator's jurisdiction.
+ These I believe to be:
+ mode32: r3 to r12
+ mode64: r3 to r10
+ */
+ /* XXXXXXXXXXXXXXXXX BUG! This doesn't say anything about the FP
+ or Altivec registers. We get away with this ONLY because
+ getAllocatableRegs_PPC gives the allocator callee-saved fp
+ and Altivec regs, and no caller-save ones. */
+ addHRegUse(u, HRmWrite, hregPPC_GPR3(mode64));
+ addHRegUse(u, HRmWrite, hregPPC_GPR4(mode64));
+ addHRegUse(u, HRmWrite, hregPPC_GPR5(mode64));
+ addHRegUse(u, HRmWrite, hregPPC_GPR6(mode64));
+ addHRegUse(u, HRmWrite, hregPPC_GPR7(mode64));
+ addHRegUse(u, HRmWrite, hregPPC_GPR8(mode64));
+ addHRegUse(u, HRmWrite, hregPPC_GPR9(mode64));
+ addHRegUse(u, HRmWrite, hregPPC_GPR10(mode64));
+ if (!mode64) {
+ addHRegUse(u, HRmWrite, hregPPC_GPR11(mode64));
+ addHRegUse(u, HRmWrite, hregPPC_GPR12(mode64));
+ }
+
+ /* Now we have to state any parameter-carrying registers
+ which might be read. This depends on the argiregs field. */
+ argir = i->Pin.Call.argiregs;
+ if (argir &(1<<10)) addHRegUse(u, HRmRead, hregPPC_GPR10(mode64));
+ if (argir & (1<<9)) addHRegUse(u, HRmRead, hregPPC_GPR9(mode64));
+ if (argir & (1<<8)) addHRegUse(u, HRmRead, hregPPC_GPR8(mode64));
+ if (argir & (1<<7)) addHRegUse(u, HRmRead, hregPPC_GPR7(mode64));
+ if (argir & (1<<6)) addHRegUse(u, HRmRead, hregPPC_GPR6(mode64));
+ if (argir & (1<<5)) addHRegUse(u, HRmRead, hregPPC_GPR5(mode64));
+ if (argir & (1<<4)) addHRegUse(u, HRmRead, hregPPC_GPR4(mode64));
+ if (argir & (1<<3)) addHRegUse(u, HRmRead, hregPPC_GPR3(mode64));
+
+ vassert(0 == (argir & ~((1<<3)|(1<<4)|(1<<5)|(1<<6)
+ |(1<<7)|(1<<8)|(1<<9)|(1<<10))));
+
+ /* Finally, there is the issue that the insn trashes a
+ register because the literal target address has to be
+ loaded into a register. %r10 seems a suitable victim.
+ (Can't use %r0, as some insns interpret it as value zero). */
+ addHRegUse(u, HRmWrite, hregPPC_GPR10(mode64));
+ /* Upshot of this is that the assembler really must use %r10,
+ and no other, as a destination temporary. */
+ return;
+ }
+ case Pin_Goto:
+ addRegUsage_PPCRI(u, i->Pin.Goto.dst);
+ /* GPR3 holds destination address from Pin_Goto */
+ addHRegUse(u, HRmWrite, hregPPC_GPR3(mode64));
+ if (i->Pin.Goto.jk != Ijk_Boring
+ && i->Pin.Goto.jk != Ijk_Call
+ && i->Pin.Goto.jk != Ijk_Ret)
+ /* note, this is irrelevant since the guest state pointer
+ register is not actually available to the allocator.
+ But still ... */
+ addHRegUse(u, HRmWrite, GuestStatePtr(mode64));
+ return;
+ case Pin_CMov:
+ addRegUsage_PPCRI(u, i->Pin.CMov.src);
+ addHRegUse(u, HRmWrite, i->Pin.CMov.dst);
+ return;
+ case Pin_Load:
+ addRegUsage_PPCAMode(u, i->Pin.Load.src);
+ addHRegUse(u, HRmWrite, i->Pin.Load.dst);
+ return;
+ case Pin_LoadL:
+ addHRegUse(u, HRmRead, i->Pin.LoadL.src);
+ addHRegUse(u, HRmWrite, i->Pin.LoadL.dst);
+ return;
+ case Pin_Store:
+ addHRegUse(u, HRmRead, i->Pin.Store.src);
+ addRegUsage_PPCAMode(u, i->Pin.Store.dst);
+ return;
+ case Pin_StoreC:
+ addHRegUse(u, HRmRead, i->Pin.StoreC.src);
+ addHRegUse(u, HRmRead, i->Pin.StoreC.dst);
+ return;
+ case Pin_Set:
+ addHRegUse(u, HRmWrite, i->Pin.Set.dst);
+ return;
+ case Pin_MfCR:
+ addHRegUse(u, HRmWrite, i->Pin.MfCR.dst);
+ return;
+ case Pin_MFence:
+ return;
+
+ case Pin_FpUnary:
+ addHRegUse(u, HRmWrite, i->Pin.FpUnary.dst);
+ addHRegUse(u, HRmRead, i->Pin.FpUnary.src);
+ return;
+ case Pin_FpBinary:
+ addHRegUse(u, HRmWrite, i->Pin.FpBinary.dst);
+ addHRegUse(u, HRmRead, i->Pin.FpBinary.srcL);
+ addHRegUse(u, HRmRead, i->Pin.FpBinary.srcR);
+ return;
+ case Pin_FpMulAcc:
+ addHRegUse(u, HRmWrite, i->Pin.FpMulAcc.dst);
+ addHRegUse(u, HRmRead, i->Pin.FpMulAcc.srcML);
+ addHRegUse(u, HRmRead, i->Pin.FpMulAcc.srcMR);
+ addHRegUse(u, HRmRead, i->Pin.FpMulAcc.srcAcc);
+ return;
+ case Pin_FpLdSt:
+ addHRegUse(u, (i->Pin.FpLdSt.isLoad ? HRmWrite : HRmRead),
+ i->Pin.FpLdSt.reg);
+ addRegUsage_PPCAMode(u, i->Pin.FpLdSt.addr);
+ return;
+ case Pin_FpSTFIW:
+ addHRegUse(u, HRmRead, i->Pin.FpSTFIW.addr);
+ addHRegUse(u, HRmRead, i->Pin.FpSTFIW.data);
+ return;
+ case Pin_FpRSP:
+ addHRegUse(u, HRmWrite, i->Pin.FpRSP.dst);
+ addHRegUse(u, HRmRead, i->Pin.FpRSP.src);
+ return;
+ case Pin_FpCftI:
+ addHRegUse(u, HRmWrite, i->Pin.FpCftI.dst);
+ addHRegUse(u, HRmRead, i->Pin.FpCftI.src);
+ return;
+ case Pin_FpCMov:
+ addHRegUse(u, HRmModify, i->Pin.FpCMov.dst);
+ addHRegUse(u, HRmRead, i->Pin.FpCMov.src);
+ return;
+ case Pin_FpLdFPSCR:
+ addHRegUse(u, HRmRead, i->Pin.FpLdFPSCR.src);
+ return;
+ case Pin_FpCmp:
+ addHRegUse(u, HRmWrite, i->Pin.FpCmp.dst);
+ addHRegUse(u, HRmRead, i->Pin.FpCmp.srcL);
+ addHRegUse(u, HRmRead, i->Pin.FpCmp.srcR);
+ return;
+
+ case Pin_RdWrLR:
+ addHRegUse(u, (i->Pin.RdWrLR.wrLR ? HRmRead : HRmWrite),
+ i->Pin.RdWrLR.gpr);
+ return;
+
+ case Pin_AvLdSt:
+ addHRegUse(u, (i->Pin.AvLdSt.isLoad ? HRmWrite : HRmRead),
+ i->Pin.AvLdSt.reg);
+ if (i->Pin.AvLdSt.addr->tag == Pam_IR)
+ addHRegUse(u, HRmWrite, hregPPC_GPR30(mode64));
+ addRegUsage_PPCAMode(u, i->Pin.AvLdSt.addr);
+ return;
+ case Pin_AvUnary:
+ addHRegUse(u, HRmWrite, i->Pin.AvUnary.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvUnary.src);
+ return;
+ case Pin_AvBinary:
+ if (i->Pin.AvBinary.op == Pav_XOR
+ && i->Pin.AvBinary.dst == i->Pin.AvBinary.srcL
+ && i->Pin.AvBinary.dst == i->Pin.AvBinary.srcR) {
+ /* reg-alloc needs to understand 'xor r,r,r' as a write of r */
+ /* (as opposed to a rite of passage :-) */
+ addHRegUse(u, HRmWrite, i->Pin.AvBinary.dst);
+ } else {
+ addHRegUse(u, HRmWrite, i->Pin.AvBinary.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvBinary.srcL);
+ addHRegUse(u, HRmRead, i->Pin.AvBinary.srcR);
+ }
+ return;
+ case Pin_AvBin8x16:
+ addHRegUse(u, HRmWrite, i->Pin.AvBin8x16.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvBin8x16.srcL);
+ addHRegUse(u, HRmRead, i->Pin.AvBin8x16.srcR);
+ return;
+ case Pin_AvBin16x8:
+ addHRegUse(u, HRmWrite, i->Pin.AvBin16x8.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvBin16x8.srcL);
+ addHRegUse(u, HRmRead, i->Pin.AvBin16x8.srcR);
+ return;
+ case Pin_AvBin32x4:
+ addHRegUse(u, HRmWrite, i->Pin.AvBin32x4.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvBin32x4.srcL);
+ addHRegUse(u, HRmRead, i->Pin.AvBin32x4.srcR);
+ return;
+ case Pin_AvBin32Fx4:
+ addHRegUse(u, HRmWrite, i->Pin.AvBin32Fx4.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvBin32Fx4.srcL);
+ addHRegUse(u, HRmRead, i->Pin.AvBin32Fx4.srcR);
+ if (i->Pin.AvBin32Fx4.op == Pavfp_MULF)
+ addHRegUse(u, HRmWrite, hregPPC_VR29());
+ return;
+ case Pin_AvUn32Fx4:
+ addHRegUse(u, HRmWrite, i->Pin.AvUn32Fx4.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvUn32Fx4.src);
+ return;
+ case Pin_AvPerm:
+ addHRegUse(u, HRmWrite, i->Pin.AvPerm.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvPerm.srcL);
+ addHRegUse(u, HRmRead, i->Pin.AvPerm.srcR);
+ addHRegUse(u, HRmRead, i->Pin.AvPerm.ctl);
+ return;
+ case Pin_AvSel:
+ addHRegUse(u, HRmWrite, i->Pin.AvSel.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvSel.ctl);
+ addHRegUse(u, HRmRead, i->Pin.AvSel.srcL);
+ addHRegUse(u, HRmRead, i->Pin.AvSel.srcR);
+ return;
+ case Pin_AvShlDbl:
+ addHRegUse(u, HRmWrite, i->Pin.AvShlDbl.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvShlDbl.srcL);
+ addHRegUse(u, HRmRead, i->Pin.AvShlDbl.srcR);
+ return;
+ case Pin_AvSplat:
+ addHRegUse(u, HRmWrite, i->Pin.AvSplat.dst);
+ addRegUsage_PPCVI5s(u, i->Pin.AvSplat.src);
+ return;
+ case Pin_AvCMov:
+ addHRegUse(u, HRmModify, i->Pin.AvCMov.dst);
+ addHRegUse(u, HRmRead, i->Pin.AvCMov.src);
+ return;
+ case Pin_AvLdVSCR:
+ addHRegUse(u, HRmRead, i->Pin.AvLdVSCR.src);
+ return;
+
+ default:
+ ppPPCInstr(i, mode64);
+ vpanic("getRegUsage_PPCInstr");
+ }
+}
+
+/* local helper */
+static void mapReg( HRegRemap* m, HReg* r )
+{
+ *r = lookupHRegRemap(m, *r);
+}
+
+void mapRegs_PPCInstr ( HRegRemap* m, PPCInstr* i, Bool mode64 )
+{
+ switch (i->tag) {
+ case Pin_LI:
+ mapReg(m, &i->Pin.LI.dst);
+ return;
+ case Pin_Alu:
+ mapReg(m, &i->Pin.Alu.dst);
+ mapReg(m, &i->Pin.Alu.srcL);
+ mapRegs_PPCRH(m, i->Pin.Alu.srcR);
+ return;
+ case Pin_Shft:
+ mapReg(m, &i->Pin.Shft.dst);
+ mapReg(m, &i->Pin.Shft.srcL);
+ mapRegs_PPCRH(m, i->Pin.Shft.srcR);
+ return;
+ case Pin_AddSubC:
+ mapReg(m, &i->Pin.AddSubC.dst);
+ mapReg(m, &i->Pin.AddSubC.srcL);
+ mapReg(m, &i->Pin.AddSubC.srcR);
+ return;
+ case Pin_Cmp:
+ mapReg(m, &i->Pin.Cmp.srcL);
+ mapRegs_PPCRH(m, i->Pin.Cmp.srcR);
+ return;
+ case Pin_Unary:
+ mapReg(m, &i->Pin.Unary.dst);
+ mapReg(m, &i->Pin.Unary.src);
+ return;
+ case Pin_MulL:
+ mapReg(m, &i->Pin.MulL.dst);
+ mapReg(m, &i->Pin.MulL.srcL);
+ mapReg(m, &i->Pin.MulL.srcR);
+ return;
+ case Pin_Div:
+ mapReg(m, &i->Pin.Div.dst);
+ mapReg(m, &i->Pin.Div.srcL);
+ mapReg(m, &i->Pin.Div.srcR);
+ return;
+ case Pin_Call:
+ return;
+ case Pin_Goto:
+ mapRegs_PPCRI(m, i->Pin.Goto.dst);
+ return;
+ case Pin_CMov:
+ mapRegs_PPCRI(m, i->Pin.CMov.src);
+ mapReg(m, &i->Pin.CMov.dst);
+ return;
+ case Pin_Load:
+ mapRegs_PPCAMode(m, i->Pin.Load.src);
+ mapReg(m, &i->Pin.Load.dst);
+ return;
+ case Pin_LoadL:
+ mapReg(m, &i->Pin.LoadL.src);
+ mapReg(m, &i->Pin.LoadL.dst);
+ return;
+ case Pin_Store:
+ mapReg(m, &i->Pin.Store.src);
+ mapRegs_PPCAMode(m, i->Pin.Store.dst);
+ return;
+ case Pin_StoreC:
+ mapReg(m, &i->Pin.StoreC.src);
+ mapReg(m, &i->Pin.StoreC.dst);
+ return;
+ case Pin_Set:
+ mapReg(m, &i->Pin.Set.dst);
+ return;
+ case Pin_MfCR:
+ mapReg(m, &i->Pin.MfCR.dst);
+ return;
+ case Pin_MFence:
+ return;
+ case Pin_FpUnary:
+ mapReg(m, &i->Pin.FpUnary.dst);
+ mapReg(m, &i->Pin.FpUnary.src);
+ return;
+ case Pin_FpBinary:
+ mapReg(m, &i->Pin.FpBinary.dst);
+ mapReg(m, &i->Pin.FpBinary.srcL);
+ mapReg(m, &i->Pin.FpBinary.srcR);
+ return;
+ case Pin_FpMulAcc:
+ mapReg(m, &i->Pin.FpMulAcc.dst);
+ mapReg(m, &i->Pin.FpMulAcc.srcML);
+ mapReg(m, &i->Pin.FpMulAcc.srcMR);
+ mapReg(m, &i->Pin.FpMulAcc.srcAcc);
+ return;
+ case Pin_FpLdSt:
+ mapReg(m, &i->Pin.FpLdSt.reg);
+ mapRegs_PPCAMode(m, i->Pin.FpLdSt.addr);
+ return;
+ case Pin_FpSTFIW:
+ mapReg(m, &i->Pin.FpSTFIW.addr);
+ mapReg(m, &i->Pin.FpSTFIW.data);
+ return;
+ case Pin_FpRSP:
+ mapReg(m, &i->Pin.FpRSP.dst);
+ mapReg(m, &i->Pin.FpRSP.src);
+ return;
+ case Pin_FpCftI:
+ mapReg(m, &i->Pin.FpCftI.dst);
+ mapReg(m, &i->Pin.FpCftI.src);
+ return;
+ case Pin_FpCMov:
+ mapReg(m, &i->Pin.FpCMov.dst);
+ mapReg(m, &i->Pin.FpCMov.src);
+ return;
+ case Pin_FpLdFPSCR:
+ mapReg(m, &i->Pin.FpLdFPSCR.src);
+ return;
+ case Pin_FpCmp:
+ mapReg(m, &i->Pin.FpCmp.dst);
+ mapReg(m, &i->Pin.FpCmp.srcL);
+ mapReg(m, &i->Pin.FpCmp.srcR);
+ return;
+ case Pin_RdWrLR:
+ mapReg(m, &i->Pin.RdWrLR.gpr);
+ return;
+ case Pin_AvLdSt:
+ mapReg(m, &i->Pin.AvLdSt.reg);
+ mapRegs_PPCAMode(m, i->Pin.AvLdSt.addr);
+ return;
+ case Pin_AvUnary:
+ mapReg(m, &i->Pin.AvUnary.dst);
+ mapReg(m, &i->Pin.AvUnary.src);
+ return;
+ case Pin_AvBinary:
+ mapReg(m, &i->Pin.AvBinary.dst);
+ mapReg(m, &i->Pin.AvBinary.srcL);
+ mapReg(m, &i->Pin.AvBinary.srcR);
+ return;
+ case Pin_AvBin8x16:
+ mapReg(m, &i->Pin.AvBin8x16.dst);
+ mapReg(m, &i->Pin.AvBin8x16.srcL);
+ mapReg(m, &i->Pin.AvBin8x16.srcR);
+ return;
+ case Pin_AvBin16x8:
+ mapReg(m, &i->Pin.AvBin16x8.dst);
+ mapReg(m, &i->Pin.AvBin16x8.srcL);
+ mapReg(m, &i->Pin.AvBin16x8.srcR);
+ return;
+ case Pin_AvBin32x4:
+ mapReg(m, &i->Pin.AvBin32x4.dst);
+ mapReg(m, &i->Pin.AvBin32x4.srcL);
+ mapReg(m, &i->Pin.AvBin32x4.srcR);
+ return;
+ case Pin_AvBin32Fx4:
+ mapReg(m, &i->Pin.AvBin32Fx4.dst);
+ mapReg(m, &i->Pin.AvBin32Fx4.srcL);
+ mapReg(m, &i->Pin.AvBin32Fx4.srcR);
+ return;
+ case Pin_AvUn32Fx4:
+ mapReg(m, &i->Pin.AvUn32Fx4.dst);
+ mapReg(m, &i->Pin.AvUn32Fx4.src);
+ return;
+ case Pin_AvPerm:
+ mapReg(m, &i->Pin.AvPerm.dst);
+ mapReg(m, &i->Pin.AvPerm.srcL);
+ mapReg(m, &i->Pin.AvPerm.srcR);
+ mapReg(m, &i->Pin.AvPerm.ctl);
+ return;
+ case Pin_AvSel:
+ mapReg(m, &i->Pin.AvSel.dst);
+ mapReg(m, &i->Pin.AvSel.srcL);
+ mapReg(m, &i->Pin.AvSel.srcR);
+ mapReg(m, &i->Pin.AvSel.ctl);
+ return;
+ case Pin_AvShlDbl:
+ mapReg(m, &i->Pin.AvShlDbl.dst);
+ mapReg(m, &i->Pin.AvShlDbl.srcL);
+ mapReg(m, &i->Pin.AvShlDbl.srcR);
+ return;
+ case Pin_AvSplat:
+ mapReg(m, &i->Pin.AvSplat.dst);
+ mapRegs_PPCVI5s(m, i->Pin.AvSplat.src);
+ return;
+ case Pin_AvCMov:
+ mapReg(m, &i->Pin.AvCMov.dst);
+ mapReg(m, &i->Pin.AvCMov.src);
+ return;
+ case Pin_AvLdVSCR:
+ mapReg(m, &i->Pin.AvLdVSCR.src);
+ return;
+
+ default:
+ ppPPCInstr(i, mode64);
+ vpanic("mapRegs_PPCInstr");
+ }
+}
+
+/* Figure out if i represents a reg-reg move, and if so assign the
+ source and destination to *src and *dst. If in doubt say No. Used
+ by the register allocator to do move coalescing.
+*/
+Bool isMove_PPCInstr ( PPCInstr* i, HReg* src, HReg* dst )
+{
+ /* Moves between integer regs */
+ if (i->tag == Pin_Alu) {
+ // or Rd,Rs,Rs == mr Rd,Rs
+ if (i->Pin.Alu.op != Palu_OR)
+ return False;
+ if (i->Pin.Alu.srcR->tag != Prh_Reg)
+ return False;
+ if (i->Pin.Alu.srcR->Prh.Reg.reg != i->Pin.Alu.srcL)
+ return False;
+ *src = i->Pin.Alu.srcL;
+ *dst = i->Pin.Alu.dst;
+ return True;
+ }
+ /* Moves between FP regs */
+ if (i->tag == Pin_FpUnary) {
+ if (i->Pin.FpUnary.op != Pfp_MOV)
+ return False;
+ *src = i->Pin.FpUnary.src;
+ *dst = i->Pin.FpUnary.dst;
+ return True;
+ }
+ return False;
+}
+
+
+/* Generate ppc spill/reload instructions under the direction of the
+ register allocator. Note it's critical these don't write the
+ condition codes. */
+
+void genSpill_PPC ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 )
+{
+ PPCAMode* am;
+ vassert(!hregIsVirtual(rreg));
+ *i1 = *i2 = NULL;
+ am = PPCAMode_IR( offsetB, GuestStatePtr(mode64) );
+ switch (hregClass(rreg)) {
+ case HRcInt64:
+ vassert(mode64);
+ *i1 = PPCInstr_Store( 8, am, rreg, mode64 );
+ return;
+ case HRcInt32:
+ vassert(!mode64);
+ *i1 = PPCInstr_Store( 4, am, rreg, mode64 );
+ return;
+ case HRcFlt64:
+ *i1 = PPCInstr_FpLdSt ( False/*store*/, 8, rreg, am );
+ return;
+ case HRcVec128:
+ // XXX: GPR30 used as spill register to kludge AltiVec AMode_IR
+ *i1 = PPCInstr_AvLdSt ( False/*store*/, 16, rreg, am );
+ return;
+ default:
+ ppHRegClass(hregClass(rreg));
+ vpanic("genSpill_PPC: unimplemented regclass");
+ }
+}
+
+void genReload_PPC ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 )
+{
+ PPCAMode* am;
+ vassert(!hregIsVirtual(rreg));
+ *i1 = *i2 = NULL;
+ am = PPCAMode_IR( offsetB, GuestStatePtr(mode64) );
+ switch (hregClass(rreg)) {
+ case HRcInt64:
+ vassert(mode64);
+ *i1 = PPCInstr_Load( 8, rreg, am, mode64 );
+ return;
+ case HRcInt32:
+ vassert(!mode64);
+ *i1 = PPCInstr_Load( 4, rreg, am, mode64 );
+ return;
+ case HRcFlt64:
+ *i1 = PPCInstr_FpLdSt ( True/*load*/, 8, rreg, am );
+ return;
+ case HRcVec128:
+ // XXX: GPR30 used as spill register to kludge AltiVec AMode_IR
+ *i1 = PPCInstr_AvLdSt ( True/*load*/, 16, rreg, am );
+ return;
+ default:
+ ppHRegClass(hregClass(rreg));
+ vpanic("genReload_PPC: unimplemented regclass");
+ }
+}
+
+
+/* --------- The ppc assembler (bleh.) --------- */
+
+static UInt iregNo ( HReg r, Bool mode64 )
+{
+ UInt n;
+ vassert(hregClass(r) == (mode64 ? HRcInt64 : HRcInt32));
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n < 32);
+ return n;
+}
+
+static UInt fregNo ( HReg fr )
+{
+ UInt n;
+ vassert(hregClass(fr) == HRcFlt64);
+ vassert(!hregIsVirtual(fr));
+ n = hregNumber(fr);
+ vassert(n < 32);
+ return n;
+}
+
+static UInt vregNo ( HReg v )
+{
+ UInt n;
+ vassert(hregClass(v) == HRcVec128);
+ vassert(!hregIsVirtual(v));
+ n = hregNumber(v);
+ vassert(n < 32);
+ return n;
+}
+
+ /* Emit a 32-bit instruction in big-endian byte order */
+static UChar* emit32 ( UChar* p, UInt w32 )
+{
+ *p++ = toUChar((w32 >> 24) & 0x000000FF);
+ *p++ = toUChar((w32 >> 16) & 0x000000FF);
+ *p++ = toUChar((w32 >> 8) & 0x000000FF);
+ *p++ = toUChar((w32) & 0x000000FF);
+ return p;
+}
+
+/* The following mkForm[...] functions refer to ppc instruction forms
+ as per PPC32 p576
+ */
+
+static UChar* mkFormD ( UChar* p, UInt opc1,
+ UInt r1, UInt r2, UInt imm )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ imm = imm & 0xFFFF;
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) | (imm));
+ return emit32(p, theInstr);
+}
+
+static UChar* mkFormMD ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt imm1, UInt imm2, UInt opc2 )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(imm1 < 0x40);
+ vassert(imm2 < 0x40);
+ vassert(opc2 < 0x08);
+ imm2 = ((imm2 & 0x1F) << 1) | (imm2 >> 5);
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) |
+ ((imm1 & 0x1F)<<11) | (imm2<<5) |
+ (opc2<<2) | ((imm1 >> 5)<<1));
+ return emit32(p, theInstr);
+}
+
+static UChar* mkFormX ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt r3, UInt opc2, UInt b0 )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(r3 < 0x20);
+ vassert(opc2 < 0x400);
+ vassert(b0 < 0x2);
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) |
+ (r3<<11) | (opc2<<1) | (b0));
+ return emit32(p, theInstr);
+}
+
+static UChar* mkFormXO ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt r3, UInt b10, UInt opc2, UInt b0 )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(r3 < 0x20);
+ vassert(b10 < 0x2);
+ vassert(opc2 < 0x200);
+ vassert(b0 < 0x2);
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) |
+ (r3<<11) | (b10 << 10) | (opc2<<1) | (b0));
+ return emit32(p, theInstr);
+}
+
+static UChar* mkFormXL ( UChar* p, UInt opc1, UInt f1, UInt f2,
+ UInt f3, UInt opc2, UInt b0 )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(f1 < 0x20);
+ vassert(f2 < 0x20);
+ vassert(f3 < 0x20);
+ vassert(opc2 < 0x400);
+ vassert(b0 < 0x2);
+ theInstr = ((opc1<<26) | (f1<<21) | (f2<<16) |
+ (f3<<11) | (opc2<<1) | (b0));
+ return emit32(p, theInstr);
+}
+
+ // Note: for split-field ops, pass the mnemonic's field value;
+ // it is rearranged into the split encoding below
+static UChar* mkFormXFX ( UChar* p, UInt r1, UInt f2, UInt opc2 )
+{
+ UInt theInstr;
+ vassert(r1 < 0x20);
+ vassert(f2 < 0x20);
+ vassert(opc2 < 0x400);
+ switch (opc2) {
+ case 144: // mtcrf
+ vassert(f2 < 0x100);
+ f2 = f2 << 1;
+ break;
+ case 339: // mfspr
+ case 371: // mftb
+ case 467: // mtspr
+ vassert(f2 < 0x400);
+ // re-arrange split field
+ f2 = ((f2>>5) & 0x1F) | ((f2 & 0x1F)<<5);
+ break;
+ default: vpanic("mkFormXFX(ppch)");
+ }
+ theInstr = ((31<<26) | (r1<<21) | (f2<<11) | (opc2<<1));
+ return emit32(p, theInstr);
+}
+
+// Only used by mtfsf
+static UChar* mkFormXFL ( UChar* p, UInt FM, UInt freg )
+{
+ UInt theInstr;
+ vassert(FM < 0x100);
+ vassert(freg < 0x20);
+ theInstr = ((63<<26) | (FM<<17) | (freg<<11) | (711<<1));
+ return emit32(p, theInstr);
+}
+
+static UChar* mkFormXS ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt imm, UInt opc2, UInt b0 )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(imm < 0x40);
+ vassert(opc2 < 0x400);
+ vassert(b0 < 0x2);
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) |
+ ((imm & 0x1F)<<11) | (opc2<<2) | ((imm>>5)<<1) | (b0));
+ return emit32(p, theInstr);
+}
+
+
+#if 0
+// 'b'
+static UChar* mkFormI ( UChar* p, UInt LI, UInt AA, UInt LK )
+{
+ UInt theInstr;
+ vassert(LI < 0x1000000);
+ vassert(AA < 0x2);
+ vassert(LK < 0x2);
+ theInstr = ((18<<26) | (LI<<2) | (AA<<1) | (LK));
+ return emit32(p, theInstr);
+}
+#endif
+
+// 'bc'
+static UChar* mkFormB ( UChar* p, UInt BO, UInt BI,
+ UInt BD, UInt AA, UInt LK )
+{
+ UInt theInstr;
+ vassert(BO < 0x20);
+ vassert(BI < 0x20);
+ vassert(BD < 0x4000);
+ vassert(AA < 0x2);
+ vassert(LK < 0x2);
+ theInstr = ((16<<26) | (BO<<21) | (BI<<16) |
+ (BD<<2) | (AA<<1) | (LK));
+ return emit32(p, theInstr);
+}
+
+// rotates
+static UChar* mkFormM ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt f3, UInt MB, UInt ME, UInt Rc )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(f3 < 0x20);
+ vassert(MB < 0x20);
+ vassert(ME < 0x20);
+ vassert(Rc < 0x2);
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) |
+ (f3<<11) | (MB<<6) | (ME<<1) | (Rc));
+ return emit32(p, theInstr);
+}
+
+static UChar* mkFormA ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt r3, UInt r4, UInt opc2, UInt b0 )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(r3 < 0x20);
+ vassert(r4 < 0x20);
+ vassert(opc2 < 0x20);
+ vassert(b0 < 0x2 );
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) | (r3<<11) |
+ (r4<<6) | (opc2<<1) | (b0));
+ return emit32(p, theInstr);
+}
+
+static UChar* doAMode_IR ( UChar* p, UInt opc1, UInt rSD,
+ PPCAMode* am, Bool mode64 )
+{
+ UInt rA, idx;
+ vassert(am->tag == Pam_IR);
+ vassert(am->Pam.IR.index < 0x10000);
+
+ rA = iregNo(am->Pam.IR.base, mode64);
+ idx = am->Pam.IR.index;
+
+ if (opc1 == 58 || opc1 == 62) { // ld/std: mode64 only
+ vassert(mode64);
+ /* stay sane with DS form: lowest 2 bits must be 00. This
+ should be guaranteed to us by iselWordExpr_AMode. */
+ vassert(0 == (idx & 3));
+ }
+ p = mkFormD(p, opc1, rSD, rA, idx);
+ return p;
+}
+
+static UChar* doAMode_RR ( UChar* p, UInt opc1, UInt opc2,
+ UInt rSD, PPCAMode* am, Bool mode64 )
+{
+ UInt rA, rB;
+ vassert(am->tag == Pam_RR);
+
+ rA = iregNo(am->Pam.RR.base, mode64);
+ rB = iregNo(am->Pam.RR.index, mode64);
+
+ p = mkFormX(p, opc1, rSD, rA, rB, opc2, 0);
+ return p;
+}
+
+
+/* Load imm to r_dst */
+static UChar* mkLoadImm ( UChar* p, UInt r_dst, ULong imm, Bool mode64 )
+{
+ vassert(r_dst < 0x20);
+
+ if (!mode64) {
+ /* In 32-bit mode, make sure the top 32 bits of imm are a sign
+ extension of the bottom 32 bits, so that the range tests
+ below work correctly. */
+ UInt u32 = (UInt)imm;
+ Int s32 = (Int)u32;
+ Long s64 = (Long)s32;
+ imm = (ULong)s64;
+ }
+
+ if (imm >= 0xFFFFFFFFFFFF8000ULL || imm < 0x8000) {
+ // sign-extendable from 16 bits
+
+ // addi r_dst,0,imm => li r_dst,imm
+ p = mkFormD(p, 14, r_dst, 0, imm & 0xFFFF);
+ } else {
+ if (imm >= 0xFFFFFFFF80000000ULL || imm < 0x80000000ULL) {
+ // sign-extendable from 32 bits
+
+ // addis r_dst,r0,(imm>>16) => lis r_dst, (imm>>16)
+ p = mkFormD(p, 15, r_dst, 0, (imm>>16) & 0xFFFF);
+ // ori r_dst, r_dst, (imm & 0xFFFF)
+ p = mkFormD(p, 24, r_dst, r_dst, imm & 0xFFFF);
+ } else {
+ // full 64bit immediate load: 5 (five!) insns.
+ vassert(mode64);
+
+ // load high word
+
+ // lis r_dst, (imm>>48) & 0xFFFF
+ p = mkFormD(p, 15, r_dst, 0, (imm>>48) & 0xFFFF);
+
+ // ori r_dst, r_dst, (imm>>32) & 0xFFFF
+ if ((imm>>32) & 0xFFFF)
+ p = mkFormD(p, 24, r_dst, r_dst, (imm>>32) & 0xFFFF);
+
+ // shift r_dst low word to high word => rldicr
+ p = mkFormMD(p, 30, r_dst, r_dst, 32, 31, 1);
+
+ // load low word
+
+ // oris r_dst, r_dst, (imm>>16) & 0xFFFF
+ if ((imm>>16) & 0xFFFF)
+ p = mkFormD(p, 25, r_dst, r_dst, (imm>>16) & 0xFFFF);
+
+ // ori r_dst, r_dst, (imm) & 0xFFFF
+ if (imm & 0xFFFF)
+ p = mkFormD(p, 24, r_dst, r_dst, imm & 0xFFFF);
+ }
+ }
+ return p;
+}
+
+ /* Move r_src to r_dst */
+static UChar* mkMoveReg ( UChar* p, UInt r_dst, UInt r_src )
+{
+ vassert(r_dst < 0x20);
+ vassert(r_src < 0x20);
+
+ if (r_dst != r_src) {
+ /* or r_dst, r_src, r_src */
+ p = mkFormX(p, 31, r_src, r_dst, r_src, 444, 0 );
+ }
+ return p;
+}
+
+static UChar* mkFormVX ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt r3, UInt opc2 )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(r3 < 0x20);
+ vassert(opc2 < 0x800);
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) | (r3<<11) | opc2);
+ return emit32(p, theInstr);
+}
+
+static UChar* mkFormVXR ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt r3, UInt Rc, UInt opc2 )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(r3 < 0x20);
+ vassert(Rc < 0x2);
+ vassert(opc2 < 0x400);
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) |
+ (r3<<11) | (Rc<<10) | opc2);
+ return emit32(p, theInstr);
+}
+
+static UChar* mkFormVA ( UChar* p, UInt opc1, UInt r1, UInt r2,
+ UInt r3, UInt r4, UInt opc2 )
+{
+ UInt theInstr;
+ vassert(opc1 < 0x40);
+ vassert(r1 < 0x20);
+ vassert(r2 < 0x20);
+ vassert(r3 < 0x20);
+ vassert(r4 < 0x20);
+ vassert(opc2 < 0x40);
+ theInstr = ((opc1<<26) | (r1<<21) | (r2<<16) |
+ (r3<<11) | (r4<<6) | opc2);
+ return emit32(p, theInstr);
+}
+
+
+
+/* Emit an instruction into buf and return the number of bytes used.
+ Note that buf is not the insn's final place, and therefore it is
+ imperative to emit position-independent code.
+
+ Note, dispatch should always be NULL since ppc32/64 backends
+ use a call-return scheme to get from the dispatcher to generated
+ code and back.
+*/
+Int emit_PPCInstr ( UChar* buf, Int nbuf, PPCInstr* i,
+ Bool mode64, void* dispatch )
+{
+ UChar* p = &buf[0];
+ UChar* ptmp = p;
+ vassert(nbuf >= 32);
+
+ if (0) {
+ vex_printf("asm ");ppPPCInstr(i, mode64); vex_printf("\n");
+ }
+
+ switch (i->tag) {
+
+ case Pin_LI:
+ p = mkLoadImm(p, iregNo(i->Pin.LI.dst, mode64),
+ i->Pin.LI.imm64, mode64);
+ goto done;
+
+ case Pin_Alu: {
+ PPCRH* srcR = i->Pin.Alu.srcR;
+ Bool immR = toBool(srcR->tag == Prh_Imm);
+ UInt r_dst = iregNo(i->Pin.Alu.dst, mode64);
+ UInt r_srcL = iregNo(i->Pin.Alu.srcL, mode64);
+ UInt r_srcR = immR ? (-1)/*bogus*/ :
+ iregNo(srcR->Prh.Reg.reg, mode64);
+
+ switch (i->Pin.Alu.op) {
+ case Palu_ADD:
+ if (immR) {
+ /* addi (PPC32 p350) */
+ vassert(srcR->Prh.Imm.syned);
+ vassert(srcR->Prh.Imm.imm16 != 0x8000);
+ p = mkFormD(p, 14, r_dst, r_srcL, srcR->Prh.Imm.imm16);
+ } else {
+ /* add (PPC32 p347) */
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 266, 0);
+ }
+ break;
+
+ case Palu_SUB:
+ if (immR) {
+ /* addi (PPC32 p350), but with negated imm */
+ vassert(srcR->Prh.Imm.syned);
+ vassert(srcR->Prh.Imm.imm16 != 0x8000);
+ p = mkFormD(p, 14, r_dst, r_srcL, (- srcR->Prh.Imm.imm16));
+ } else {
+ /* subf (PPC32 p537), with args the "wrong" way round */
+ p = mkFormXO(p, 31, r_dst, r_srcR, r_srcL, 0, 40, 0);
+ }
+ break;
+
+ case Palu_AND:
+ if (immR) {
+ /* andi. (PPC32 p358) */
+ vassert(!srcR->Prh.Imm.syned);
+ p = mkFormD(p, 28, r_srcL, r_dst, srcR->Prh.Imm.imm16);
+ } else {
+ /* and (PPC32 p356) */
+ p = mkFormX(p, 31, r_srcL, r_dst, r_srcR, 28, 0);
+ }
+ break;
+
+ case Palu_OR:
+ if (immR) {
+ /* ori (PPC32 p497) */
+ vassert(!srcR->Prh.Imm.syned);
+ p = mkFormD(p, 24, r_srcL, r_dst, srcR->Prh.Imm.imm16);
+ } else {
+ /* or (PPC32 p495) */
+ p = mkFormX(p, 31, r_srcL, r_dst, r_srcR, 444, 0);
+ }
+ break;
+
+ case Palu_XOR:
+ if (immR) {
+ /* xori (PPC32 p550) */
+ vassert(!srcR->Prh.Imm.syned);
+ p = mkFormD(p, 26, r_srcL, r_dst, srcR->Prh.Imm.imm16);
+ } else {
+ /* xor (PPC32 p549) */
+ p = mkFormX(p, 31, r_srcL, r_dst, r_srcR, 316, 0);
+ }
+ break;
+
+ default:
+ goto bad;
+ }
+ goto done;
+ }
+
+ case Pin_Shft: {
+ PPCRH* srcR = i->Pin.Shft.srcR;
+ Bool sz32 = i->Pin.Shft.sz32;
+ Bool immR = toBool(srcR->tag == Prh_Imm);
+ UInt r_dst = iregNo(i->Pin.Shft.dst, mode64);
+ UInt r_srcL = iregNo(i->Pin.Shft.srcL, mode64);
+ UInt r_srcR = immR ? (-1)/*bogus*/ :
+ iregNo(srcR->Prh.Reg.reg, mode64);
+ if (!mode64)
+ vassert(sz32);
+
+ switch (i->Pin.Shft.op) {
+ case Pshft_SHL:
+ if (sz32) {
+ if (immR) {
+ /* rd = rs << n, 1 <= n <= 31
+ is
+ rlwinm rd,rs,n,0,31-n (PPC32 p501)
+ */
+ UInt n = srcR->Prh.Imm.imm16;
+ vassert(!srcR->Prh.Imm.syned);
+ vassert(n > 0 && n < 32);
+ p = mkFormM(p, 21, r_srcL, r_dst, n, 0, 31-n, 0);
+ } else {
+ /* slw (PPC32 p505) */
+ p = mkFormX(p, 31, r_srcL, r_dst, r_srcR, 24, 0);
+ }
+ } else {
+ if (immR) {
+ /* rd = rs << n, 1 <= n <= 63
+ is
+ rldicr rd,rs,n,63-n (PPC64 p559)
+ */
+ UInt n = srcR->Prh.Imm.imm16;
+ vassert(!srcR->Prh.Imm.syned);
+ vassert(n > 0 && n < 64);
+ p = mkFormMD(p, 30, r_srcL, r_dst, n, 63-n, 1);
+ } else {
+ /* sld (PPC64 p568) */
+ p = mkFormX(p, 31, r_srcL, r_dst, r_srcR, 27, 0);
+ }
+ }
+ break;
+
+ case Pshft_SHR:
+ if (sz32) {
+ if (immR) {
+ /* rd = rs >>u n, 1 <= n <= 31
+ is
+ rlwinm rd,rs,32-n,n,31 (PPC32 p501)
+ */
+ UInt n = srcR->Prh.Imm.imm16;
+ vassert(!srcR->Prh.Imm.syned);
+ vassert(n > 0 && n < 32);
+ p = mkFormM(p, 21, r_srcL, r_dst, 32-n, n, 31, 0);
+ } else {
+ /* srw (PPC32 p508) */
+ p = mkFormX(p, 31, r_srcL, r_dst, r_srcR, 536, 0);
+ }
+ } else {
+ if (immR) {
+ /* rd = rs >>u n, 1 <= n <= 63
+ is
+ rldicl rd,rs,64-n,n (PPC64 p558)
+ */
+ UInt n = srcR->Prh.Imm.imm16;
+ vassert(!srcR->Prh.Imm.syned);
+ vassert(n > 0 && n < 64);
+ p = mkFormMD(p, 30, r_srcL, r_dst, 64-n, n, 0);
+ } else {
+ /* srd (PPC64 p574) */
+ p = mkFormX(p, 31, r_srcL, r_dst, r_srcR, 539, 0);
+ }
+ }
+ break;
+
+ case Pshft_SAR:
+ if (sz32) {
+ if (immR) {
+ /* srawi (PPC32 p507) */
+ UInt n = srcR->Prh.Imm.imm16;
+ vassert(!srcR->Prh.Imm.syned);
+ /* In 64-bit mode, we allow right shifts by zero bits
+ as that is a handy way to sign extend the lower 32
+ bits into the upper 32 bits. */
+ if (mode64)
+ vassert(n >= 0 && n < 32);
+ else
+ vassert(n > 0 && n < 32);
+ p = mkFormX(p, 31, r_srcL, r_dst, n, 824, 0);
+ } else {
+ /* sraw (PPC32 p506) */
+ p = mkFormX(p, 31, r_srcL, r_dst, r_srcR, 792, 0);
+ }
+ } else {
+ if (immR) {
+ /* sradi (PPC64 p571) */
+ UInt n = srcR->Prh.Imm.imm16;
+ vassert(!srcR->Prh.Imm.syned);
+ vassert(n > 0 && n < 64);
+ p = mkFormXS(p, 31, r_srcL, r_dst, n, 413, 0);
+ } else {
+ /* srad (PPC32 p570) */
+ p = mkFormX(p, 31, r_srcL, r_dst, r_srcR, 794, 0);
+ }
+ }
+ break;
+
+ default:
+ goto bad;
+ }
+ goto done;
+ }
+
+ case Pin_AddSubC: {
+ Bool isAdd = i->Pin.AddSubC.isAdd;
+ Bool setC = i->Pin.AddSubC.setC;
+ UInt r_srcL = iregNo(i->Pin.AddSubC.srcL, mode64);
+ UInt r_srcR = iregNo(i->Pin.AddSubC.srcR, mode64);
+ UInt r_dst = iregNo(i->Pin.AddSubC.dst, mode64);
+
+ if (isAdd) {
+ if (setC) /* addc (PPC32 p348) */
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 10, 0);
+ else /* adde (PPC32 p349) */
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 138, 0);
+ } else {
+ /* subfX, with args the "wrong" way round */
+ if (setC) /* subfc (PPC32 p538) */
+ p = mkFormXO(p, 31, r_dst, r_srcR, r_srcL, 0, 8, 0);
+ else /* subfe (PPC32 p539) */
+ p = mkFormXO(p, 31, r_dst, r_srcR, r_srcL, 0, 136, 0);
+ }
+ goto done;
+ }
+
+ case Pin_Cmp: {
+ Bool syned = i->Pin.Cmp.syned;
+ Bool sz32 = i->Pin.Cmp.sz32;
+ UInt fld1 = i->Pin.Cmp.crfD << 2;
+ UInt r_srcL = iregNo(i->Pin.Cmp.srcL, mode64);
+ UInt r_srcR, imm_srcR;
+ PPCRH* srcR = i->Pin.Cmp.srcR;
+
+ if (!mode64) // cmp double word invalid for mode32
+ vassert(sz32);
+ else if (!sz32) // mode64 && cmp64: set L=1
+ fld1 |= 1;
+
+ switch (srcR->tag) {
+ case Prh_Imm:
+ vassert(syned == srcR->Prh.Imm.syned);
+ imm_srcR = srcR->Prh.Imm.imm16;
+ if (syned) { // cmpwi/cmpdi (signed) (PPC32 p368)
+ vassert(imm_srcR != 0x8000);
+ p = mkFormD(p, 11, fld1, r_srcL, imm_srcR);
+ } else { // cmplwi/cmpldi (unsigned) (PPC32 p370)
+ p = mkFormD(p, 10, fld1, r_srcL, imm_srcR);
+ }
+ break;
+ case Prh_Reg:
+ r_srcR = iregNo(srcR->Prh.Reg.reg, mode64);
+ if (syned) // cmpw/cmpd (signed) (PPC32 p367)
+ p = mkFormX(p, 31, fld1, r_srcL, r_srcR, 0, 0);
+ else // cmplw/cmpld (unsigned) (PPC32 p379)
+ p = mkFormX(p, 31, fld1, r_srcL, r_srcR, 32, 0);
+ break;
+ default:
+ goto bad;
+ }
+ goto done;
+ }
+
+ case Pin_Unary: {
+ UInt r_dst = iregNo(i->Pin.Unary.dst, mode64);
+ UInt r_src = iregNo(i->Pin.Unary.src, mode64);
+
+ switch (i->Pin.Unary.op) {
+ case Pun_NOT: // nor r_dst,r_src,r_src
+ p = mkFormX(p, 31, r_src, r_dst, r_src, 124, 0);
+ break;
+ case Pun_NEG: // neg r_dst,r_src
+ p = mkFormXO(p, 31, r_dst, r_src, 0, 0, 104, 0);
+ break;
+ case Pun_CLZ32: // cntlzw r_dst, r_src
+ p = mkFormX(p, 31, r_src, r_dst, 0, 26, 0);
+ break;
+ case Pun_CLZ64: // cntlzd r_dst, r_src
+ vassert(mode64);
+ p = mkFormX(p, 31, r_src, r_dst, 0, 58, 0);
+ break;
+ case Pun_EXTSW: // extsw r_dst, r_src
+ vassert(mode64);
+ p = mkFormX(p, 31, r_src, r_dst, 0, 986, 0);
+ break;
+ default: goto bad;
+ }
+ goto done;
+ }
+
+ case Pin_MulL: {
+ Bool syned = i->Pin.MulL.syned;
+ Bool sz32 = i->Pin.MulL.sz32;
+ UInt r_dst = iregNo(i->Pin.MulL.dst, mode64);
+ UInt r_srcL = iregNo(i->Pin.MulL.srcL, mode64);
+ UInt r_srcR = iregNo(i->Pin.MulL.srcR, mode64);
+
+ if (!mode64)
+ vassert(sz32);
+
+ if (i->Pin.MulL.hi) {
+ // mul hi words, must consider sign
+ if (sz32) {
+ if (syned) // mulhw r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 75, 0);
+ else // mulhwu r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 11, 0);
+ } else {
+ if (syned) // mulhd r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 73, 0);
+ else // mulhdu r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 9, 0);
+ }
+ } else {
+ // mul low word, sign is irrelevant
+ vassert(!i->Pin.MulL.syned);
+ if (sz32) // mullw r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 235, 0);
+ else // mulld r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 233, 0);
+ }
+ goto done;
+ }
+
+ case Pin_Div: {
+ Bool syned = i->Pin.Div.syned;
+ Bool sz32 = i->Pin.Div.sz32;
+ UInt r_dst = iregNo(i->Pin.Div.dst, mode64);
+ UInt r_srcL = iregNo(i->Pin.Div.srcL, mode64);
+ UInt r_srcR = iregNo(i->Pin.Div.srcR, mode64);
+
+ if (!mode64)
+ vassert(sz32);
+
+ if (sz32) {
+ if (syned) // divw r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 491, 0);
+ else // divwu r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 459, 0);
+ } else {
+ if (syned) // divd r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 489, 0);
+ else // divdu r_dst,r_srcL,r_srcR
+ p = mkFormXO(p, 31, r_dst, r_srcL, r_srcR, 0, 457, 0);
+ }
+ goto done;
+ }
+
+ case Pin_Call: {
+ PPCCondCode cond = i->Pin.Call.cond;
+ UInt r_dst = 10;
+ /* As per detailed comment for Pin_Call in
+ getRegUsage_PPCInstr above, %r10 is used as an address temp */
+
+ /* jump over the following insns if condition does not hold */
+ if (cond.test != Pct_ALWAYS) {
+ /* jmp fwds if !condition */
+ /* don't know how many bytes to jump over yet...
+ make space for a jump instruction and fill in later. */
+ ptmp = p; /* fill in this bit later */
+ p += 4;
+ }
+
+ /* load target to r_dst */ // p += 4|8|20
+ p = mkLoadImm(p, r_dst, i->Pin.Call.target, mode64);
+
+ /* mtspr 9,r_dst => move r_dst to count register */
+ p = mkFormXFX(p, r_dst, 9, 467); // p += 4
+
+ /* bctrl => branch to count register (and save to lr) */
+ p = mkFormXL(p, 19, Pct_ALWAYS, 0, 0, 528, 1); // p += 4
+
+ /* Fix up the conditional jump, if there was one. */
+ if (cond.test != Pct_ALWAYS) {
+ Int delta = p - ptmp;
+ vassert(delta >= 16 && delta <= 32);
+ /* bc !ct,cf,delta */
+ mkFormB(ptmp, invertCondTest(cond.test),
+ cond.flag, (delta>>2), 0, 0);
+ }
+ goto done;
+ }
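+
+ /* Hence, in the worst (conditional, mode64, 5-insn literal) case
+ the emitted sequence is
+ bc !cond,fwd // 4 bytes, patched in above
+ <load target to r10> // 4 .. 20 bytes
+ mtspr 9,r10 // 4 bytes (mtctr)
+ bctrl // 4 bytes
+ which is exactly the 16..32 byte range that the vassert on
+ delta checks. */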
+
+ case Pin_Goto: {
+ UInt trc = 0;
+ UChar r_ret = 3; /* Put target addr into %r3 */
+ PPCCondCode cond = i->Pin.Goto.cond;
+ UInt r_dst;
+ ULong imm_dst;
+
+ vassert(dispatch == NULL);
+
+ /* First off, if this is conditional, create a conditional
+ jump over the rest of it. */
+ if (cond.test != Pct_ALWAYS) {
+ /* jmp fwds if !condition */
+ /* don't know how many bytes to jump over yet...
+ make space for a jump instruction and fill in later. */
+ ptmp = p; /* fill in this bit later */
+ p += 4;
+ }
+
+ // cond succeeds...
+
+ /* If non-boring, set GuestStatePtr appropriately. */
+ switch (i->Pin.Goto.jk) {
+ case Ijk_ClientReq: trc = VEX_TRC_JMP_CLIENTREQ; break;
+ case Ijk_Sys_syscall: trc = VEX_TRC_JMP_SYS_SYSCALL; break;
+ case Ijk_Yield: trc = VEX_TRC_JMP_YIELD; break;
+ case Ijk_EmWarn: trc = VEX_TRC_JMP_EMWARN; break;
+ case Ijk_EmFail: trc = VEX_TRC_JMP_EMFAIL; break;
+ case Ijk_MapFail: trc = VEX_TRC_JMP_MAPFAIL; break;
+ case Ijk_NoDecode: trc = VEX_TRC_JMP_NODECODE; break;
+ case Ijk_TInval: trc = VEX_TRC_JMP_TINVAL; break;
+ case Ijk_NoRedir: trc = VEX_TRC_JMP_NOREDIR; break;
+ case Ijk_SigTRAP: trc = VEX_TRC_JMP_SIGTRAP; break;
+ case Ijk_SigBUS: trc = VEX_TRC_JMP_SIGBUS; break;
+ case Ijk_Ret:
+ case Ijk_Call:
+ case Ijk_Boring:
+ break;
+ default:
+ ppIRJumpKind(i->Pin.Goto.jk);
+ vpanic("emit_PPCInstr.Pin_Goto: unknown jump kind");
+ }
+ if (trc != 0) {
+ vassert(trc < 0x10000);
+ /* addi r31,0,trc */
+ p = mkFormD(p, 14, 31, 0, trc); // p += 4
+ }
+
+ /* Get the destination address into %r_ret */
+ if (i->Pin.Goto.dst->tag == Pri_Imm) {
+ imm_dst = i->Pin.Goto.dst->Pri.Imm;
+ p = mkLoadImm(p, r_ret, imm_dst, mode64); // p += 4|8|20
+ } else {
+ vassert(i->Pin.Goto.dst->tag == Pri_Reg);
+ r_dst = iregNo(i->Pin.Goto.dst->Pri.Reg, mode64);
+ p = mkMoveReg(p, r_ret, r_dst); // p += 4
+ }
+
+ /* blr */
+ p = mkFormXL(p, 19, Pct_ALWAYS, 0, 0, 16, 0); // p += 4
+
+ /* Fix up the conditional jump, if there was one. */
+ if (cond.test != Pct_ALWAYS) {
+ Int delta = p - ptmp;
+ vassert(delta >= 12 && delta <= 32);
+ /* bc !ct,cf,delta */
+ mkFormB(ptmp, invertCondTest(cond.test),
+ cond.flag, delta>>2, 0, 0);
+ }
+ goto done;
+ }
+
+ case Pin_CMov: {
+ UInt r_dst, r_src;
+ ULong imm_src;
+ PPCCondCode cond;
+ vassert(i->Pin.CMov.cond.test != Pct_ALWAYS);
+
+ r_dst = iregNo(i->Pin.CMov.dst, mode64);
+ cond = i->Pin.CMov.cond;
+
+ /* branch (if cond fails) over move instrs */
+ if (cond.test != Pct_ALWAYS) {
+ /* don't know how many bytes to jump over yet...
+ make space for a jump instruction and fill in later. */
+ ptmp = p; /* fill in this bit later */
+ p += 4;
+ }
+
+ // cond true: move src => dst
+ switch (i->Pin.CMov.src->tag) {
+ case Pri_Imm:
+ imm_src = i->Pin.CMov.src->Pri.Imm;
+ p = mkLoadImm(p, r_dst, imm_src, mode64); // p += 4|8|20
+ break;
+ case Pri_Reg:
+ r_src = iregNo(i->Pin.CMov.src->Pri.Reg, mode64);
+ p = mkMoveReg(p, r_dst, r_src); // p += 4
+ break;
+ default: goto bad;
+ }
+
+ /* Fix up the conditional jump, if there was one. */
+ if (cond.test != Pct_ALWAYS) {
+ Int delta = p - ptmp;
+ vassert(delta >= 8 && delta <= 24);
+ /* bc !ct,cf,delta */
+ mkFormB(ptmp, invertCondTest(cond.test),
+ cond.flag, (delta>>2), 0, 0);
+ }
+ goto done;
+ }
+
+ case Pin_Load: {
+ PPCAMode* am_addr = i->Pin.Load.src;
+ UInt r_dst = iregNo(i->Pin.Load.dst, mode64);
+ UInt opc1, opc2, sz = i->Pin.Load.sz;
+ switch (am_addr->tag) {
+ case Pam_IR:
+ if (mode64 && (sz == 4 || sz == 8)) {
+ /* should be guaranteed to us by iselWordExpr_AMode */
+ vassert(0 == (am_addr->Pam.IR.index & 3));
+ }
+ switch(sz) {
+ case 1: opc1 = 34; break;
+ case 2: opc1 = 40; break;
+ case 4: opc1 = 32; break;
+ case 8: opc1 = 58; vassert(mode64); break;
+ default: goto bad;
+ }
+ p = doAMode_IR(p, opc1, r_dst, am_addr, mode64);
+ goto done;
+ case Pam_RR:
+ switch(sz) {
+ case 1: opc2 = 87; break;
+ case 2: opc2 = 279; break;
+ case 4: opc2 = 23; break;
+ case 8: opc2 = 21; vassert(mode64); break;
+ default: goto bad;
+ }
+ p = doAMode_RR(p, 31, opc2, r_dst, am_addr, mode64);
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+
+ case Pin_LoadL: {
+ if (i->Pin.LoadL.sz == 4) {
+ p = mkFormX(p, 31, iregNo(i->Pin.LoadL.dst, mode64),
+ 0, iregNo(i->Pin.LoadL.src, mode64), 20, 0);
+ goto done;
+ }
+ if (i->Pin.LoadL.sz == 8 && mode64) {
+ p = mkFormX(p, 31, iregNo(i->Pin.LoadL.dst, mode64),
+ 0, iregNo(i->Pin.LoadL.src, mode64), 84, 0);
+ goto done;
+ }
+ goto bad;
+ }
+
+ case Pin_Set: {
+ /* Make the destination register be 1 or 0, depending on whether
+ the relevant condition holds. */
+ UInt r_dst = iregNo(i->Pin.Set.dst, mode64);
+ PPCCondCode cond = i->Pin.Set.cond;
+ UInt rot_imm, r_tmp;
+
+ if (cond.test == Pct_ALWAYS) {
+ // Just load 1 to dst => li dst,1
+ p = mkFormD(p, 14, r_dst, 0, 1);
+ } else {
+ rot_imm = 1 + cond.flag;
+ r_tmp = 0; // r0 is not in getAllocableRegs_PPC, so no need to declare it.
+
+ // r_tmp = CR => mfcr r_tmp
+ p = mkFormX(p, 31, r_tmp, 0, 0, 19, 0);
+
+ // r_dst = flag (rotate left and mask)
+ // => rlwinm r_dst,r_tmp,rot_imm,31,31
+ p = mkFormM(p, 21, r_tmp, r_dst, rot_imm, 31, 31, 0);
+
+ if (cond.test == Pct_FALSE) {
+ // flip bit => xori r_dst,r_dst,1
+ p = mkFormD(p, 26, r_dst, r_dst, 1);
+ }
+ }
+ goto done;
+ }
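+
+ /* Example: for cond.flag == 2 (the EQ bit of CR0), rot_imm is 3;
+ rotating the CR image left by 3 brings (big-endian) bit 2 round
+ to bit 31, and the 31,31 mask of rlwinm keeps just that bit, so
+ r_dst ends up as exactly 0 or 1. */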
+
+ case Pin_MfCR:
+ // mfcr dst
+ p = mkFormX(p, 31, iregNo(i->Pin.MfCR.dst, mode64), 0, 0, 19, 0);
+ goto done;
+
+ case Pin_MFence: {
+ p = mkFormX(p, 31, 0, 0, 0, 598, 0); // sync, PPC32 p616
+ // CAB: Should this be isync?
+ // p = mkFormXL(p, 19, 0, 0, 0, 150, 0); // isync, PPC32 p467
+ goto done;
+ }
+
+ case Pin_Store: {
+ PPCAMode* am_addr = i->Pin.Store.dst;
+ UInt r_src = iregNo(i->Pin.Store.src, mode64);
+ UInt opc1, opc2, sz = i->Pin.Store.sz;
+ switch (i->Pin.Store.dst->tag) {
+ case Pam_IR:
+ if (mode64 && (sz == 4 || sz == 8)) {
+ /* should be guaranteed to us by iselWordExpr_AMode */
+ vassert(0 == (am_addr->Pam.IR.index & 3));
+ }
+ switch(sz) {
+ case 1: opc1 = 38; break;
+ case 2: opc1 = 44; break;
+ case 4: opc1 = 36; break;
+ case 8: vassert(mode64);
+ opc1 = 62; break;
+ default:
+ goto bad;
+ }
+ p = doAMode_IR(p, opc1, r_src, am_addr, mode64);
+ goto done;
+ case Pam_RR:
+ switch(sz) {
+ case 1: opc2 = 215; break;
+ case 2: opc2 = 407; break;
+ case 4: opc2 = 151; break;
+ case 8: vassert(mode64);
+ opc2 = 149; break;
+ default:
+ goto bad;
+ }
+ p = doAMode_RR(p, 31, opc2, r_src, am_addr, mode64);
+ goto done;
+ default:
+ goto bad;
+ }
+ goto done;
+ }
+
+ case Pin_StoreC: {
+ if (i->Pin.StoreC.sz == 4) {
+ p = mkFormX(p, 31, iregNo(i->Pin.StoreC.src, mode64),
+ 0, iregNo(i->Pin.StoreC.dst, mode64), 150, 1);
+ goto done;
+ }
+ if (i->Pin.StoreC.sz == 8 && mode64) {
+ p = mkFormX(p, 31, iregNo(i->Pin.StoreC.src, mode64),
+ 0, iregNo(i->Pin.StoreC.dst, mode64), 214, 1);
+ goto done;
+ }
+ goto bad;
+ }
+
+ case Pin_FpUnary: {
+ UInt fr_dst = fregNo(i->Pin.FpUnary.dst);
+ UInt fr_src = fregNo(i->Pin.FpUnary.src);
+ switch (i->Pin.FpUnary.op) {
+ case Pfp_RSQRTE: // frsqrte, PPC32 p424
+ p = mkFormA( p, 63, fr_dst, 0, fr_src, 0, 26, 0 );
+ break;
+ case Pfp_RES: // fres, PPC32 p421
+ p = mkFormA( p, 59, fr_dst, 0, fr_src, 0, 24, 0 );
+ break;
+ case Pfp_SQRT: // fsqrt, PPC32 p427
+ p = mkFormA( p, 63, fr_dst, 0, fr_src, 0, 22, 0 );
+ break;
+ case Pfp_ABS: // fabs, PPC32 p399
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 264, 0);
+ break;
+ case Pfp_NEG: // fneg, PPC32 p416
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 40, 0);
+ break;
+ case Pfp_MOV: // fmr, PPC32 p410
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 72, 0);
+ break;
+ case Pfp_FRIM: // frim, PPC ISA 2.05 p137
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 488, 0);
+ break;
+ case Pfp_FRIP: // frip, PPC ISA 2.05 p137
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 456, 0);
+ break;
+ case Pfp_FRIN: // frin, PPC ISA 2.05 p137
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 392, 0);
+ break;
+ case Pfp_FRIZ: // friz, PPC ISA 2.05 p137
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 424, 0);
+ break;
+ default:
+ goto bad;
+ }
+ goto done;
+ }
+
+ case Pin_FpBinary: {
+ UInt fr_dst = fregNo(i->Pin.FpBinary.dst);
+ UInt fr_srcL = fregNo(i->Pin.FpBinary.srcL);
+ UInt fr_srcR = fregNo(i->Pin.FpBinary.srcR);
+ switch (i->Pin.FpBinary.op) {
+ case Pfp_ADDD: // fadd, PPC32 p400
+ p = mkFormA( p, 63, fr_dst, fr_srcL, fr_srcR, 0, 21, 0 );
+ break;
+ case Pfp_ADDS: // fadds, PPC32 p401
+ p = mkFormA( p, 59, fr_dst, fr_srcL, fr_srcR, 0, 21, 0 );
+ break;
+ case Pfp_SUBD: // fsub, PPC32 p429
+ p = mkFormA( p, 63, fr_dst, fr_srcL, fr_srcR, 0, 20, 0 );
+ break;
+ case Pfp_SUBS: // fsubs, PPC32 p430
+ p = mkFormA( p, 59, fr_dst, fr_srcL, fr_srcR, 0, 20, 0 );
+ break;
+ case Pfp_MULD: // fmul, PPC32 p413
+ p = mkFormA( p, 63, fr_dst, fr_srcL, 0, fr_srcR, 25, 0 );
+ break;
+ case Pfp_MULS: // fmuls, PPC32 p414
+ p = mkFormA( p, 59, fr_dst, fr_srcL, 0, fr_srcR, 25, 0 );
+ break;
+ case Pfp_DIVD: // fdiv, PPC32 p406
+ p = mkFormA( p, 63, fr_dst, fr_srcL, fr_srcR, 0, 18, 0 );
+ break;
+ case Pfp_DIVS: // fdivs, PPC32 p407
+ p = mkFormA( p, 59, fr_dst, fr_srcL, fr_srcR, 0, 18, 0 );
+ break;
+ default:
+ goto bad;
+ }
+ goto done;
+ }
+
+ case Pin_FpMulAcc: {
+ UInt fr_dst = fregNo(i->Pin.FpMulAcc.dst);
+ UInt fr_srcML = fregNo(i->Pin.FpMulAcc.srcML);
+ UInt fr_srcMR = fregNo(i->Pin.FpMulAcc.srcMR);
+ UInt fr_srcAcc = fregNo(i->Pin.FpMulAcc.srcAcc);
+ switch (i->Pin.FpMulAcc.op) {
+ case Pfp_MADDD: // fmadd, PPC32 p408
+ p = mkFormA( p, 63, fr_dst, fr_srcML, fr_srcAcc, fr_srcMR, 29, 0 );
+ break;
+ case Pfp_MADDS: // fmadds, PPC32 p409
+ p = mkFormA( p, 59, fr_dst, fr_srcML, fr_srcAcc, fr_srcMR, 29, 0 );
+ break;
+ case Pfp_MSUBD: // fmsub, PPC32 p411
+ p = mkFormA( p, 63, fr_dst, fr_srcML, fr_srcAcc, fr_srcMR, 28, 0 );
+ break;
+ case Pfp_MSUBS: // fmsubs, PPC32 p412
+ p = mkFormA( p, 59, fr_dst, fr_srcML, fr_srcAcc, fr_srcMR, 28, 0 );
+ break;
+ default:
+ goto bad;
+ }
+ goto done;
+ }
+
+ case Pin_FpLdSt: {
+ PPCAMode* am_addr = i->Pin.FpLdSt.addr;
+ UInt f_reg = fregNo(i->Pin.FpLdSt.reg);
+ Bool idxd = toBool(i->Pin.FpLdSt.addr->tag == Pam_RR);
+ UChar sz = i->Pin.FpLdSt.sz;
+ UInt opc;
+ vassert(sz == 4 || sz == 8);
+
+ if (i->Pin.FpLdSt.isLoad) { // Load from memory
+ if (idxd) { // lf[s|d]x, PPC32 p444|440
+ opc = (sz == 4) ? 535 : 599;
+ p = doAMode_RR(p, 31, opc, f_reg, am_addr, mode64);
+ } else { // lf[s|d], PPC32 p441|437
+ opc = (sz == 4) ? 48 : 50;
+ p = doAMode_IR(p, opc, f_reg, am_addr, mode64);
+ }
+ } else { // Store to memory
+ if (idxd) { // stf[s|d]x, PPC32 p521|516
+ opc = (sz == 4) ? 663 : 727;
+ p = doAMode_RR(p, 31, opc, f_reg, am_addr, mode64);
+ } else { // stf[s|d], PPC32 p518|513
+ opc = (sz == 4) ? 52 : 54;
+ p = doAMode_IR(p, opc, f_reg, am_addr, mode64);
+ }
+ }
+ goto done;
+ }
+
+ case Pin_FpSTFIW: {
+ UInt ir_addr = iregNo(i->Pin.FpSTFIW.addr, mode64);
+ UInt fr_data = fregNo(i->Pin.FpSTFIW.data);
+ // stfiwx (store fp64[lo32] as int32), PPC32 p517
+ // Use rA==0, so that EA == rB == ir_addr
+ p = mkFormX(p, 31, fr_data, 0/*rA=0*/, ir_addr, 983, 0);
+ goto done;
+ }
+
+ case Pin_FpRSP: {
+ UInt fr_dst = fregNo(i->Pin.FpRSP.dst);
+ UInt fr_src = fregNo(i->Pin.FpRSP.src);
+ // frsp, PPC32 p423
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 12, 0);
+ goto done;
+ }
+
+ case Pin_FpCftI: {
+ UInt fr_dst = fregNo(i->Pin.FpCftI.dst);
+ UInt fr_src = fregNo(i->Pin.FpCftI.src);
+ if (i->Pin.FpCftI.fromI == False && i->Pin.FpCftI.int32 == True) {
+ // fctiw (conv f64 to i32), PPC32 p404
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 14, 0);
+ goto done;
+ }
+ if (i->Pin.FpCftI.fromI == False && i->Pin.FpCftI.int32 == False) {
+ // fctid (conv f64 to i64), PPC64 p437
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 814, 0);
+ goto done;
+ }
+ if (i->Pin.FpCftI.fromI == True && i->Pin.FpCftI.int32 == False) {
+ // fcfid (conv i64 to f64), PPC64 p434
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 846, 0);
+ goto done;
+ }
+ goto bad;
+ }
+
+ case Pin_FpCMov: {
+ UInt fr_dst = fregNo(i->Pin.FpCMov.dst);
+ UInt fr_src = fregNo(i->Pin.FpCMov.src);
+ PPCCondCode cc = i->Pin.FpCMov.cond;
+
+ if (fr_dst == fr_src) goto done;
+
+ vassert(cc.test != Pct_ALWAYS);
+
+ /* jmp fwds if !condition */
+ if (cc.test != Pct_ALWAYS) {
+ /* bc !ct,cf,n_bytes>>2 */
+ p = mkFormB(p, invertCondTest(cc.test), cc.flag, 8>>2, 0, 0);
+ }
+
+ // fmr, PPC32 p410
+ p = mkFormX(p, 63, fr_dst, 0, fr_src, 72, 0);
+ goto done;
+ }
+
+ case Pin_FpLdFPSCR: {
+ UInt fr_src = fregNo(i->Pin.FpLdFPSCR.src);
+ p = mkFormXFL(p, 0xFF, fr_src); // mtfsf, PPC32 p480
+ goto done;
+ }
+
+ case Pin_FpCmp: {
+ UChar crfD = 1;
+ UInt r_dst = iregNo(i->Pin.FpCmp.dst, mode64);
+ UInt fr_srcL = fregNo(i->Pin.FpCmp.srcL);
+ UInt fr_srcR = fregNo(i->Pin.FpCmp.srcR);
+ vassert(crfD < 8);
+ // fcmpo, PPC32 p402
+ p = mkFormX(p, 63, crfD<<2, fr_srcL, fr_srcR, 32, 0);
+
+ // mfcr (mv CR to r_dst), PPC32 p467
+ p = mkFormX(p, 31, r_dst, 0, 0, 19, 0);
+
+ // rlwinm r_dst,r_dst,8,28,31, PPC32 p501
+ // => rotate field 1 to bottom of word, masking out upper 28
+ p = mkFormM(p, 21, r_dst, r_dst, 8, 28, 31, 0);
+ goto done;
+ }
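+
+ /* The rlwinm step works because fcmpo wrote CR field 1, that is,
+ (big-endian) CR bits 4..7: rotating left by 8 parks those four
+ bits at positions 28..31, and the 28,31 mask clears everything
+ else, leaving the raw comparison result in the low nibble of
+ r_dst. */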
+
+ case Pin_RdWrLR: {
+ UInt reg = iregNo(i->Pin.RdWrLR.gpr, mode64);
+ /* wrLR==True ? mtlr r4 : mflr r4 */
+ p = mkFormXFX(p, reg, 8, (i->Pin.RdWrLR.wrLR==True) ? 467 : 339);
+ goto done;
+ }
+
+
+ /* AltiVec */
+ case Pin_AvLdSt: {
+ UInt opc2, v_reg, r_idx, r_base;
+ UChar sz = i->Pin.AvLdSt.sz;
+ Bool idxd = toBool(i->Pin.AvLdSt.addr->tag == Pam_RR);
+ vassert(sz == 1 || sz == 2 || sz == 4 || sz == 16);
+
+ v_reg = vregNo(i->Pin.AvLdSt.reg);
+ r_base = iregNo(i->Pin.AvLdSt.addr->Pam.RR.base, mode64);
+
+ // Only have AltiVec AMode_RR: kludge AMode_IR
+ if (!idxd) {
+ r_idx = 30; // XXX: Using r30 as temp
+ p = mkLoadImm(p, r_idx,
+ i->Pin.AvLdSt.addr->Pam.IR.index, mode64);
+ } else {
+ r_idx = iregNo(i->Pin.AvLdSt.addr->Pam.RR.index, mode64);
+ }
+
+ if (i->Pin.AvLdSt.isLoad) { // Load from memory (1,2,4,16)
+ opc2 = (sz==1) ? 7 : (sz==2) ? 39 : (sz==4) ? 71 : 103;
+ p = mkFormX(p, 31, v_reg, r_idx, r_base, opc2, 0);
+ } else { // Store to memory (1,2,4,16)
+ opc2 = (sz==1) ? 135 : (sz==2) ? 167 : (sz==4) ? 199 : 231;
+ p = mkFormX(p, 31, v_reg, r_idx, r_base, opc2, 0);
+ }
+ goto done;
+ }
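+
+ /* Example of the AMode_IR kludge: for {base=rB, index=100}, a
+ vector load comes out as "li r30,100 ; lvx vD,r30,rB". AltiVec
+ has no displacement addressing mode, so the displacement is
+ first materialised in r30, which getRegUsage_PPCInstr above
+ declares as written by this insn. */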
+
+ case Pin_AvUnary: {
+ UInt v_dst = vregNo(i->Pin.AvUnary.dst);
+ UInt v_src = vregNo(i->Pin.AvUnary.src);
+ UInt opc2;
+ switch (i->Pin.AvUnary.op) {
+ case Pav_MOV: opc2 = 1156; break; // vor vD,vS,vS
+ case Pav_NOT: opc2 = 1284; break; // vnor vD,vS,vS
+ case Pav_UNPCKH8S: opc2 = 526; break; // vupkhsb
+ case Pav_UNPCKH16S: opc2 = 590; break; // vupkhsh
+ case Pav_UNPCKL8S: opc2 = 654; break; // vupklsb
+ case Pav_UNPCKL16S: opc2 = 718; break; // vupklsh
+ case Pav_UNPCKHPIX: opc2 = 846; break; // vupkhpx
+ case Pav_UNPCKLPIX: opc2 = 974; break; // vupklpx
+ default:
+ goto bad;
+ }
+ switch (i->Pin.AvUnary.op) {
+ case Pav_MOV:
+ case Pav_NOT:
+ p = mkFormVX( p, 4, v_dst, v_src, v_src, opc2 );
+ break;
+ default:
+ p = mkFormVX( p, 4, v_dst, 0, v_src, opc2 );
+ break;
+ }
+ goto done;
+ }
+
+ case Pin_AvBinary: {
+ UInt v_dst = vregNo(i->Pin.AvBinary.dst);
+ UInt v_srcL = vregNo(i->Pin.AvBinary.srcL);
+ UInt v_srcR = vregNo(i->Pin.AvBinary.srcR);
+ UInt opc2;
+ if (i->Pin.AvBinary.op == Pav_SHL) {
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, 1036 ); // vslo
+ p = mkFormVX( p, 4, v_dst, v_dst, v_srcR, 452 ); // vsl
+ goto done;
+ }
+ if (i->Pin.AvBinary.op == Pav_SHR) {
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, 1100 ); // vsro
+ p = mkFormVX( p, 4, v_dst, v_dst, v_srcR, 708 ); // vsr
+ goto done;
+ }
+ switch (i->Pin.AvBinary.op) {
+ /* Bitwise */
+ case Pav_AND: opc2 = 1028; break; // vand
+ case Pav_OR: opc2 = 1156; break; // vor
+ case Pav_XOR: opc2 = 1220; break; // vxor
+ default:
+ goto bad;
+ }
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, opc2 );
+ goto done;
+ }
+
+ case Pin_AvBin8x16: {
+ UInt v_dst = vregNo(i->Pin.AvBin8x16.dst);
+ UInt v_srcL = vregNo(i->Pin.AvBin8x16.srcL);
+ UInt v_srcR = vregNo(i->Pin.AvBin8x16.srcR);
+ UInt opc2;
+ switch (i->Pin.AvBin8x16.op) {
+
+ case Pav_ADDU: opc2 = 0; break; // vaddubm
+ case Pav_QADDU: opc2 = 512; break; // vaddubs
+ case Pav_QADDS: opc2 = 768; break; // vaddsbs
+
+ case Pav_SUBU: opc2 = 1024; break; // vsububm
+ case Pav_QSUBU: opc2 = 1536; break; // vsububs
+ case Pav_QSUBS: opc2 = 1792; break; // vsubsbs
+
+ case Pav_OMULU: opc2 = 8; break; // vmuloub
+ case Pav_OMULS: opc2 = 264; break; // vmulosb
+ case Pav_EMULU: opc2 = 520; break; // vmuleub
+ case Pav_EMULS: opc2 = 776; break; // vmulesb
+
+ case Pav_AVGU: opc2 = 1026; break; // vavgub
+ case Pav_AVGS: opc2 = 1282; break; // vavgsb
+ case Pav_MAXU: opc2 = 2; break; // vmaxub
+ case Pav_MAXS: opc2 = 258; break; // vmaxsb
+ case Pav_MINU: opc2 = 514; break; // vminub
+ case Pav_MINS: opc2 = 770; break; // vminsb
+
+ case Pav_CMPEQU: opc2 = 6; break; // vcmpequb
+ case Pav_CMPGTU: opc2 = 518; break; // vcmpgtub
+ case Pav_CMPGTS: opc2 = 774; break; // vcmpgtsb
+
+ case Pav_SHL: opc2 = 260; break; // vslb
+ case Pav_SHR: opc2 = 516; break; // vsrb
+ case Pav_SAR: opc2 = 772; break; // vsrab
+ case Pav_ROTL: opc2 = 4; break; // vrlb
+
+ case Pav_MRGHI: opc2 = 12; break; // vmrghb
+ case Pav_MRGLO: opc2 = 268; break; // vmrglb
+
+ default:
+ goto bad;
+ }
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, opc2 );
+ goto done;
+ }
+
+ case Pin_AvBin16x8: {
+ UInt v_dst = vregNo(i->Pin.AvBin16x8.dst);
+ UInt v_srcL = vregNo(i->Pin.AvBin16x8.srcL);
+ UInt v_srcR = vregNo(i->Pin.AvBin16x8.srcR);
+ UInt opc2;
+ switch (i->Pin.AvBin16x8.op) {
+
+ case Pav_ADDU: opc2 = 64; break; // vadduhm
+ case Pav_QADDU: opc2 = 576; break; // vadduhs
+ case Pav_QADDS: opc2 = 832; break; // vaddshs
+
+ case Pav_SUBU: opc2 = 1088; break; // vsubuhm
+ case Pav_QSUBU: opc2 = 1600; break; // vsubuhs
+ case Pav_QSUBS: opc2 = 1856; break; // vsubshs
+
+ case Pav_OMULU: opc2 = 72; break; // vmulouh
+ case Pav_OMULS: opc2 = 328; break; // vmulosh
+ case Pav_EMULU: opc2 = 584; break; // vmuleuh
+ case Pav_EMULS: opc2 = 840; break; // vmulesh
+
+ case Pav_AVGU: opc2 = 1090; break; // vavguh
+ case Pav_AVGS: opc2 = 1346; break; // vavgsh
+ case Pav_MAXU: opc2 = 66; break; // vmaxuh
+ case Pav_MAXS: opc2 = 322; break; // vmaxsh
+ case Pav_MINS: opc2 = 834; break; // vminsh
+ case Pav_MINU: opc2 = 578; break; // vminuh
+
+ case Pav_CMPEQU: opc2 = 70; break; // vcmpequh
+ case Pav_CMPGTU: opc2 = 582; break; // vcmpgtuh
+ case Pav_CMPGTS: opc2 = 838; break; // vcmpgtsh
+
+ case Pav_SHL: opc2 = 324; break; // vslh
+ case Pav_SHR: opc2 = 580; break; // vsrh
+ case Pav_SAR: opc2 = 836; break; // vsrah
+ case Pav_ROTL: opc2 = 68; break; // vrlh
+
+ case Pav_PACKUU: opc2 = 14; break; // vpkuhum
+ case Pav_QPACKUU: opc2 = 142; break; // vpkuhus
+ case Pav_QPACKSU: opc2 = 270; break; // vpkshus
+ case Pav_QPACKSS: opc2 = 398; break; // vpkshss
+ case Pav_PACKPXL: opc2 = 782; break; // vpkpx
+
+ case Pav_MRGHI: opc2 = 76; break; // vmrghh
+ case Pav_MRGLO: opc2 = 332; break; // vmrglh
+
+ default:
+ goto bad;
+ }
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, opc2 );
+ goto done;
+ }
+
+ case Pin_AvBin32x4: {
+ UInt v_dst = vregNo(i->Pin.AvBin32x4.dst);
+ UInt v_srcL = vregNo(i->Pin.AvBin32x4.srcL);
+ UInt v_srcR = vregNo(i->Pin.AvBin32x4.srcR);
+ UInt opc2;
+ switch (i->Pin.AvBin32x4.op) {
+
+ case Pav_ADDU: opc2 = 128; break; // vadduwm
+ case Pav_QADDU: opc2 = 640; break; // vadduws
+ case Pav_QADDS: opc2 = 896; break; // vaddsws
+
+ case Pav_SUBU: opc2 = 1152; break; // vsubuwm
+ case Pav_QSUBU: opc2 = 1664; break; // vsubuws
+ case Pav_QSUBS: opc2 = 1920; break; // vsubsws
+
+ case Pav_AVGU: opc2 = 1154; break; // vavguw
+ case Pav_AVGS: opc2 = 1410; break; // vavgsw
+
+ case Pav_MAXU: opc2 = 130; break; // vmaxuw
+ case Pav_MAXS: opc2 = 386; break; // vmaxsw
+
+ case Pav_MINS: opc2 = 898; break; // vminsw
+ case Pav_MINU: opc2 = 642; break; // vminuw
+
+ case Pav_CMPEQU: opc2 = 134; break; // vcmpequw
+ case Pav_CMPGTS: opc2 = 902; break; // vcmpgtsw
+ case Pav_CMPGTU: opc2 = 646; break; // vcmpgtuw
+
+ case Pav_SHL: opc2 = 388; break; // vslw
+ case Pav_SHR: opc2 = 644; break; // vsrw
+ case Pav_SAR: opc2 = 900; break; // vsraw
+ case Pav_ROTL: opc2 = 132; break; // vrlw
+
+ case Pav_PACKUU: opc2 = 78; break; // vpkuwum
+ case Pav_QPACKUU: opc2 = 206; break; // vpkuwus
+ case Pav_QPACKSU: opc2 = 334; break; // vpkswus
+ case Pav_QPACKSS: opc2 = 462; break; // vpkswss
+
+ case Pav_MRGHI: opc2 = 140; break; // vmrghw
+ case Pav_MRGLO: opc2 = 396; break; // vmrglw
+
+ default:
+ goto bad;
+ }
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, opc2 );
+ goto done;
+ }
+
+ case Pin_AvBin32Fx4: {
+ UInt v_dst = vregNo(i->Pin.AvBin32Fx4.dst);
+ UInt v_srcL = vregNo(i->Pin.AvBin32Fx4.srcL);
+ UInt v_srcR = vregNo(i->Pin.AvBin32Fx4.srcR);
+ switch (i->Pin.AvBin32Fx4.op) {
+
+ case Pavfp_ADDF:
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, 10 ); // vaddfp
+ break;
+ case Pavfp_SUBF:
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, 74 ); // vsubfp
+ break;
+ case Pavfp_MAXF:
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, 1034 ); // vmaxfp
+ break;
+ case Pavfp_MINF:
+ p = mkFormVX( p, 4, v_dst, v_srcL, v_srcR, 1098 ); // vminfp
+ break;
+
+ case Pavfp_MULF: {
+         /* Make a vmulfp from a vmaddfp:
+            load -0.0 (0x8000_0000) into each 32-bit word of vB;
+            this makes the add a no-op.
+         */
+ UInt vB = 29; // XXX: Using v29 for temp do not change
+ // without also changing
+ // getRegUsage_PPCInstr
+ UInt konst = 0x1F;
+
+ // Better way to load -0.0 (0x80000000) ?
+         // vspltisw vB,0x1F (sign-extends the 5-bit imm: each word of vB = 0xFFFFFFFF)
+ p = mkFormVX( p, 4, vB, konst, 0, 908 );
+
+         // vslw vB,vB,vB (each word of vB = (0xFFFFFFFF << 31) = 0x80000000)
+ p = mkFormVX( p, 4, vB, vB, vB, 388 );
+
+ // Finally, do the multiply:
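+         //   vmaddfp: v_dst = (v_srcL * v_srcR) + vB; since each word
+         //   of vB is -0.0 and x + -0.0 == x for every x, this is a
+         //   plain multiply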
+ p = mkFormVA( p, 4, v_dst, v_srcL, vB, v_srcR, 46 );
+ break;
+ }
+ case Pavfp_CMPEQF: // vcmpeqfp
+ p = mkFormVXR( p, 4, v_dst, v_srcL, v_srcR, 0, 198 );
+ break;
+ case Pavfp_CMPGTF: // vcmpgtfp
+ p = mkFormVXR( p, 4, v_dst, v_srcL, v_srcR, 0, 710 );
+ break;
+ case Pavfp_CMPGEF: // vcmpgefp
+ p = mkFormVXR( p, 4, v_dst, v_srcL, v_srcR, 0, 454 );
+ break;
+
+ default:
+ goto bad;
+ }
+ goto done;
+ }
+
+ case Pin_AvUn32Fx4: {
+ UInt v_dst = vregNo(i->Pin.AvUn32Fx4.dst);
+ UInt v_src = vregNo(i->Pin.AvUn32Fx4.src);
+ UInt opc2;
+ switch (i->Pin.AvUn32Fx4.op) {
+ case Pavfp_RCPF: opc2 = 266; break; // vrefp
+ case Pavfp_RSQRTF: opc2 = 330; break; // vrsqrtefp
+ case Pavfp_CVTU2F: opc2 = 778; break; // vcfux
+ case Pavfp_CVTS2F: opc2 = 842; break; // vcfsx
+ case Pavfp_QCVTF2U: opc2 = 906; break; // vctuxs
+ case Pavfp_QCVTF2S: opc2 = 970; break; // vctsxs
+ case Pavfp_ROUNDM: opc2 = 714; break; // vrfim
+ case Pavfp_ROUNDP: opc2 = 650; break; // vrfip
+ case Pavfp_ROUNDN: opc2 = 522; break; // vrfin
+ case Pavfp_ROUNDZ: opc2 = 586; break; // vrfiz
+ default:
+ goto bad;
+ }
+ p = mkFormVX( p, 4, v_dst, 0, v_src, opc2 );
+ goto done;
+ }
+
+ case Pin_AvPerm: { // vperm
+ UInt v_dst = vregNo(i->Pin.AvPerm.dst);
+ UInt v_srcL = vregNo(i->Pin.AvPerm.srcL);
+ UInt v_srcR = vregNo(i->Pin.AvPerm.srcR);
+ UInt v_ctl = vregNo(i->Pin.AvPerm.ctl);
+ p = mkFormVA( p, 4, v_dst, v_srcL, v_srcR, v_ctl, 43 );
+ goto done;
+ }
+
+ case Pin_AvSel: { // vsel
+ UInt v_ctl = vregNo(i->Pin.AvSel.ctl);
+ UInt v_dst = vregNo(i->Pin.AvSel.dst);
+ UInt v_srcL = vregNo(i->Pin.AvSel.srcL);
+ UInt v_srcR = vregNo(i->Pin.AvSel.srcR);
+ p = mkFormVA( p, 4, v_dst, v_srcL, v_srcR, v_ctl, 42 );
+ goto done;
+ }
+
+ case Pin_AvShlDbl: { // vsldoi
+ UInt shift = i->Pin.AvShlDbl.shift;
+ UInt v_dst = vregNo(i->Pin.AvShlDbl.dst);
+ UInt v_srcL = vregNo(i->Pin.AvShlDbl.srcL);
+ UInt v_srcR = vregNo(i->Pin.AvShlDbl.srcR);
+ vassert(shift <= 0xF);
+ p = mkFormVA( p, 4, v_dst, v_srcL, v_srcR, shift, 44 );
+ goto done;
+ }
+
+ case Pin_AvSplat: { // vsplt(is)(b,h,w)
+      UInt v_dst = vregNo(i->Pin.AvSplat.dst);
+ UChar sz = i->Pin.AvSplat.sz;
+ UInt v_src, opc2;
+ vassert(sz == 8 || sz == 16 || sz == 32);
+
+ if (i->Pin.AvSplat.src->tag == Pvi_Imm) {
+ Char simm5;
+ opc2 = (sz == 8) ? 780 : (sz == 16) ? 844 : 908; // 8,16,32
+ /* expects 5-bit-signed-imm */
+ simm5 = i->Pin.AvSplat.src->Pvi.Imm5s;
+ vassert(simm5 >= -16 && simm5 <= 15);
+ simm5 = simm5 & 0x1F;
+ p = mkFormVX( p, 4, v_dst, (UInt)simm5, 0, opc2 );
+ }
+      else { // Pvi_Reg
+ UInt lowest_lane;
+ opc2 = (sz == 8) ? 524 : (sz == 16) ? 588 : 652; // 8,16,32
+ vassert(hregClass(i->Pin.AvSplat.src->Pvi.Reg) == HRcVec128);
+ v_src = vregNo(i->Pin.AvSplat.src->Pvi.Reg);
+ lowest_lane = (128/sz)-1;
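+         // e.g. sz==8 => lowest_lane==15, the rightmost (least
+         // significant) element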
+ p = mkFormVX( p, 4, v_dst, lowest_lane, v_src, opc2 );
+ }
+ goto done;
+ }
+
+ case Pin_AvCMov: {
+ UInt v_dst = vregNo(i->Pin.AvCMov.dst);
+ UInt v_src = vregNo(i->Pin.AvCMov.src);
+ PPCCondCode cc = i->Pin.AvCMov.cond;
+
+ if (v_dst == v_src) goto done;
+
+ vassert(cc.test != Pct_ALWAYS);
+
+ /* jmp fwds 2 insns if !condition */
+ if (cc.test != Pct_ALWAYS) {
+ /* bc !ct,cf,n_bytes>>2 */
+ p = mkFormB(p, invertCondTest(cc.test), cc.flag, 8>>2, 0, 0);
+ }
+ /* vmr */
+ p = mkFormVX( p, 4, v_dst, v_src, v_src, 1156 );
+ goto done;
+ }
+
+ case Pin_AvLdVSCR: { // mtvscr
+ UInt v_src = vregNo(i->Pin.AvLdVSCR.src);
+ p = mkFormVX( p, 4, 0, 0, v_src, 1604 );
+ goto done;
+ }
+
+ default:
+ goto bad;
+ }
+
+ bad:
+ vex_printf("\n=> ");
+ ppPPCInstr(i, mode64);
+ vpanic("emit_PPCInstr");
+ /*NOTREACHED*/
+
+ done:
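+   /* No case above may emit more than 32 bytes (8 instructions). */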
+ vassert(p - &buf[0] <= 32);
+ return p - &buf[0];
+}
+
+/*---------------------------------------------------------------*/
+/*--- end host_ppc_defs.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_ppc_defs.h b/VEX/priv/host_ppc_defs.h
new file mode 100644
index 0000000..accfd58
--- /dev/null
+++ b/VEX/priv/host_ppc_defs.h
@@ -0,0 +1,861 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_ppc_defs.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#ifndef __VEX_HOST_PPC_DEFS_H
+#define __VEX_HOST_PPC_DEFS_H
+
+/* Num registers used for function calls */
+#define PPC_N_REGPARMS 8
+
+
+/* --------- Registers. --------- */
+
+/* The usual HReg abstraction. There are 32 real int regs,
+ 32 real float regs, and 32 real vector regs.
+*/
+
+extern void ppHRegPPC ( HReg );
+
+extern HReg hregPPC_GPR0 ( Bool mode64 ); // scratch reg / zero reg
+extern HReg hregPPC_GPR1 ( Bool mode64 ); // Stack Frame Pointer
+extern HReg hregPPC_GPR2 ( Bool mode64 ); // not used: TOC pointer
+extern HReg hregPPC_GPR3 ( Bool mode64 );
+extern HReg hregPPC_GPR4 ( Bool mode64 );
+extern HReg hregPPC_GPR5 ( Bool mode64 );
+extern HReg hregPPC_GPR6 ( Bool mode64 );
+extern HReg hregPPC_GPR7 ( Bool mode64 );
+extern HReg hregPPC_GPR8 ( Bool mode64 );
+extern HReg hregPPC_GPR9 ( Bool mode64 );
+extern HReg hregPPC_GPR10 ( Bool mode64 );
+extern HReg hregPPC_GPR11 ( Bool mode64 );
+extern HReg hregPPC_GPR12 ( Bool mode64 );
+extern HReg hregPPC_GPR13 ( Bool mode64 );
+extern HReg hregPPC_GPR14 ( Bool mode64 );
+extern HReg hregPPC_GPR15 ( Bool mode64 );
+extern HReg hregPPC_GPR16 ( Bool mode64 );
+extern HReg hregPPC_GPR17 ( Bool mode64 );
+extern HReg hregPPC_GPR18 ( Bool mode64 );
+extern HReg hregPPC_GPR19 ( Bool mode64 );
+extern HReg hregPPC_GPR20 ( Bool mode64 );
+extern HReg hregPPC_GPR21 ( Bool mode64 );
+extern HReg hregPPC_GPR22 ( Bool mode64 );
+extern HReg hregPPC_GPR23 ( Bool mode64 );
+extern HReg hregPPC_GPR24 ( Bool mode64 );
+extern HReg hregPPC_GPR25 ( Bool mode64 );
+extern HReg hregPPC_GPR26 ( Bool mode64 );
+extern HReg hregPPC_GPR27 ( Bool mode64 );
+extern HReg hregPPC_GPR28 ( Bool mode64 );
+extern HReg hregPPC_GPR29 ( Bool mode64 ); // reserved for dispatcher
+extern HReg hregPPC_GPR30 ( Bool mode64 ); // used as VMX spill temp
+extern HReg hregPPC_GPR31 ( Bool mode64 ); // GuestStatePtr (callee-saved)
+
+extern HReg hregPPC_FPR0 ( void );
+extern HReg hregPPC_FPR1 ( void );
+extern HReg hregPPC_FPR2 ( void );
+extern HReg hregPPC_FPR3 ( void );
+extern HReg hregPPC_FPR4 ( void );
+extern HReg hregPPC_FPR5 ( void );
+extern HReg hregPPC_FPR6 ( void );
+extern HReg hregPPC_FPR7 ( void );
+extern HReg hregPPC_FPR8 ( void );
+extern HReg hregPPC_FPR9 ( void );
+extern HReg hregPPC_FPR10 ( void );
+extern HReg hregPPC_FPR11 ( void );
+extern HReg hregPPC_FPR12 ( void );
+extern HReg hregPPC_FPR13 ( void );
+extern HReg hregPPC_FPR14 ( void );
+extern HReg hregPPC_FPR15 ( void );
+extern HReg hregPPC_FPR16 ( void );
+extern HReg hregPPC_FPR17 ( void );
+extern HReg hregPPC_FPR18 ( void );
+extern HReg hregPPC_FPR19 ( void );
+extern HReg hregPPC_FPR20 ( void );
+extern HReg hregPPC_FPR21 ( void );
+extern HReg hregPPC_FPR22 ( void );
+extern HReg hregPPC_FPR23 ( void );
+extern HReg hregPPC_FPR24 ( void );
+extern HReg hregPPC_FPR25 ( void );
+extern HReg hregPPC_FPR26 ( void );
+extern HReg hregPPC_FPR27 ( void );
+extern HReg hregPPC_FPR28 ( void );
+extern HReg hregPPC_FPR29 ( void );
+extern HReg hregPPC_FPR30 ( void );
+extern HReg hregPPC_FPR31 ( void );
+
+extern HReg hregPPC_VR0 ( void );
+extern HReg hregPPC_VR1 ( void );
+extern HReg hregPPC_VR2 ( void );
+extern HReg hregPPC_VR3 ( void );
+extern HReg hregPPC_VR4 ( void );
+extern HReg hregPPC_VR5 ( void );
+extern HReg hregPPC_VR6 ( void );
+extern HReg hregPPC_VR7 ( void );
+extern HReg hregPPC_VR8 ( void );
+extern HReg hregPPC_VR9 ( void );
+extern HReg hregPPC_VR10 ( void );
+extern HReg hregPPC_VR11 ( void );
+extern HReg hregPPC_VR12 ( void );
+extern HReg hregPPC_VR13 ( void );
+extern HReg hregPPC_VR14 ( void );
+extern HReg hregPPC_VR15 ( void );
+extern HReg hregPPC_VR16 ( void );
+extern HReg hregPPC_VR17 ( void );
+extern HReg hregPPC_VR18 ( void );
+extern HReg hregPPC_VR19 ( void );
+extern HReg hregPPC_VR20 ( void );
+extern HReg hregPPC_VR21 ( void );
+extern HReg hregPPC_VR22 ( void );
+extern HReg hregPPC_VR23 ( void );
+extern HReg hregPPC_VR24 ( void );
+extern HReg hregPPC_VR25 ( void );
+extern HReg hregPPC_VR26 ( void );
+extern HReg hregPPC_VR27 ( void );
+extern HReg hregPPC_VR28 ( void );
+extern HReg hregPPC_VR29 ( void );
+extern HReg hregPPC_VR30 ( void );
+extern HReg hregPPC_VR31 ( void );
+
+#define StackFramePtr(_mode64) hregPPC_GPR1(_mode64)
+#define GuestStatePtr(_mode64) hregPPC_GPR31(_mode64)
+
+
+
+/* --------- Condition codes --------- */
+
+/* This gives names to bitfields in CR; hence it names BI numbers */
+/* Using IBM/hardware indexing convention */
+typedef
+ enum {
+ // CR7, which we use for integer compares
+ Pcf_7LT = 28, /* neg | lt */
+ Pcf_7GT = 29, /* pos | gt */
+ Pcf_7EQ = 30, /* zero | equal */
+ Pcf_7SO = 31 /* summary overflow */
+ }
+ PPCCondFlag;
+
+typedef
+ enum { /* Maps bc bitfield BO */
+ Pct_FALSE = 0x4,
+ Pct_TRUE = 0xC,
+ Pct_ALWAYS = 0x14
+ }
+ PPCCondTest;
+
+typedef
+ struct {
+ PPCCondFlag flag;
+ PPCCondTest test;
+ }
+ PPCCondCode;
+
+extern HChar* showPPCCondCode ( PPCCondCode );
+
+/* constructor */
+extern PPCCondCode mk_PPCCondCode ( PPCCondTest, PPCCondFlag );
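+/* e.g. mk_PPCCondCode(Pct_TRUE, Pcf_7EQ) yields a condition which
+   holds when CR bit 30 (CR7.EQ) is set. */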
+
+/* false->true, true->false */
+extern PPCCondTest invertCondTest ( PPCCondTest );
+
+
+
+
+/* --------- Memory address expressions (amodes). --------- */
+
+typedef
+ enum {
+ Pam_IR=1, /* Immediate (signed 16-bit) + Reg */
+ Pam_RR=2 /* Reg1 + Reg2 */
+ }
+ PPCAModeTag;
+
+typedef
+ struct {
+ PPCAModeTag tag;
+ union {
+ struct {
+ HReg base;
+ Int index;
+ } IR;
+ struct {
+ HReg base;
+ HReg index;
+ } RR;
+ } Pam;
+ }
+ PPCAMode;
+
+extern PPCAMode* PPCAMode_IR ( Int, HReg );
+extern PPCAMode* PPCAMode_RR ( HReg, HReg );
+
+extern PPCAMode* dopyPPCAMode ( PPCAMode* );
+
+extern void ppPPCAMode ( PPCAMode* );
+
+
+/* --------- Operand, which can be a reg or a u16/s16. --------- */
+/* ("RH" == "Register or Halfword immediate") */
+typedef
+ enum {
+ Prh_Imm=3,
+ Prh_Reg=4
+ }
+ PPCRHTag;
+
+typedef
+ struct {
+ PPCRHTag tag;
+ union {
+ struct {
+ Bool syned;
+ UShort imm16;
+ } Imm;
+ struct {
+ HReg reg;
+ } Reg;
+ }
+ Prh;
+ }
+ PPCRH;
+
+extern PPCRH* PPCRH_Imm ( Bool, UShort );
+extern PPCRH* PPCRH_Reg ( HReg );
+
+extern void ppPPCRH ( PPCRH* );
+
+
+/* --------- Operand, which can be a reg or a u32/64. --------- */
+
+typedef
+ enum {
+ Pri_Imm=5,
+ Pri_Reg=6
+ }
+ PPCRITag;
+
+typedef
+ struct {
+ PPCRITag tag;
+ union {
+ ULong Imm;
+ HReg Reg;
+ }
+ Pri;
+ }
+ PPCRI;
+
+extern PPCRI* PPCRI_Imm ( ULong );
+extern PPCRI* PPCRI_Reg( HReg );
+
+extern void ppPPCRI ( PPCRI* );
+
+
+/* --------- Operand, which can be a vector reg or a s5. --------- */
+/* ("VI" == "Vector Register or Immediate") */
+typedef
+ enum {
+ Pvi_Imm=7,
+ Pvi_Reg=8
+ }
+ PPCVI5sTag;
+
+typedef
+ struct {
+ PPCVI5sTag tag;
+ union {
+ Char Imm5s;
+ HReg Reg;
+ }
+ Pvi;
+ }
+ PPCVI5s;
+
+extern PPCVI5s* PPCVI5s_Imm ( Char );
+extern PPCVI5s* PPCVI5s_Reg ( HReg );
+
+extern void ppPPCVI5s ( PPCVI5s* );
+
+
+/* --------- Instructions. --------- */
+
+/* --------- */
+typedef
+ enum {
+ Pun_NEG,
+ Pun_NOT,
+ Pun_CLZ32,
+ Pun_CLZ64,
+ Pun_EXTSW
+ }
+ PPCUnaryOp;
+
+extern HChar* showPPCUnaryOp ( PPCUnaryOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Palu_INVALID,
+ Palu_ADD, Palu_SUB,
+ Palu_AND, Palu_OR, Palu_XOR,
+ }
+ PPCAluOp;
+
+extern
+HChar* showPPCAluOp ( PPCAluOp,
+ Bool /* is the 2nd operand an immediate? */);
+
+
+/* --------- */
+typedef
+ enum {
+ Pshft_INVALID,
+ Pshft_SHL, Pshft_SHR, Pshft_SAR,
+ }
+ PPCShftOp;
+
+extern
+HChar* showPPCShftOp ( PPCShftOp,
+ Bool /* is the 2nd operand an immediate? */,
+ Bool /* is this a 32bit or 64bit op? */ );
+
+
+/* --------- */
+typedef
+ enum {
+ Pfp_INVALID,
+
+ /* Ternary */
+ Pfp_MADDD, Pfp_MSUBD,
+ Pfp_MADDS, Pfp_MSUBS,
+
+ /* Binary */
+ Pfp_ADDD, Pfp_SUBD, Pfp_MULD, Pfp_DIVD,
+ Pfp_ADDS, Pfp_SUBS, Pfp_MULS, Pfp_DIVS,
+
+ /* Unary */
+ Pfp_SQRT, Pfp_ABS, Pfp_NEG, Pfp_MOV, Pfp_RES, Pfp_RSQRTE,
+ Pfp_FRIN, Pfp_FRIM, Pfp_FRIP, Pfp_FRIZ
+ }
+ PPCFpOp;
+
+extern HChar* showPPCFpOp ( PPCFpOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Pav_INVALID,
+
+ /* Integer Unary */
+ Pav_MOV, /* Mov */
+ Pav_NOT, /* Bitwise */
+ Pav_UNPCKH8S, Pav_UNPCKH16S, /* Unpack */
+ Pav_UNPCKL8S, Pav_UNPCKL16S,
+ Pav_UNPCKHPIX, Pav_UNPCKLPIX,
+
+ /* Integer Binary */
+ Pav_AND, Pav_OR, Pav_XOR, /* Bitwise */
+ Pav_ADDU, Pav_QADDU, Pav_QADDS,
+ Pav_SUBU, Pav_QSUBU, Pav_QSUBS,
+ Pav_OMULU, Pav_OMULS, Pav_EMULU, Pav_EMULS,
+ Pav_AVGU, Pav_AVGS,
+ Pav_MAXU, Pav_MAXS,
+ Pav_MINU, Pav_MINS,
+
+ /* Compare (always affects CR field 6) */
+ Pav_CMPEQU, Pav_CMPGTU, Pav_CMPGTS,
+
+ /* Shift */
+ Pav_SHL, Pav_SHR, Pav_SAR, Pav_ROTL,
+
+ /* Pack */
+ Pav_PACKUU, Pav_QPACKUU, Pav_QPACKSU, Pav_QPACKSS,
+ Pav_PACKPXL,
+
+ /* Merge */
+ Pav_MRGHI, Pav_MRGLO,
+ }
+ PPCAvOp;
+
+extern HChar* showPPCAvOp ( PPCAvOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Pavfp_INVALID,
+
+ /* Floating point binary */
+ Pavfp_ADDF, Pavfp_SUBF, Pavfp_MULF,
+ Pavfp_MAXF, Pavfp_MINF,
+ Pavfp_CMPEQF, Pavfp_CMPGTF, Pavfp_CMPGEF,
+
+ /* Floating point unary */
+ Pavfp_RCPF, Pavfp_RSQRTF,
+ Pavfp_CVTU2F, Pavfp_CVTS2F, Pavfp_QCVTF2U, Pavfp_QCVTF2S,
+ Pavfp_ROUNDM, Pavfp_ROUNDP, Pavfp_ROUNDN, Pavfp_ROUNDZ,
+ }
+ PPCAvFpOp;
+
+extern HChar* showPPCAvFpOp ( PPCAvFpOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Pin_LI, /* load word (32/64-bit) immediate (fake insn) */
+ Pin_Alu, /* word add/sub/and/or/xor */
+ Pin_Shft, /* word shl/shr/sar */
+ Pin_AddSubC, /* add/sub with read/write carry */
+ Pin_Cmp, /* word compare */
+ Pin_Unary, /* not, neg, clz */
+ Pin_MulL, /* widening multiply */
+ Pin_Div, /* div */
+ Pin_Call, /* call to address in register */
+ Pin_Goto, /* conditional/unconditional jmp to dst */
+ Pin_CMov, /* conditional move */
+ Pin_Load, /* zero-extending load a 8|16|32|64 bit value from mem */
+ Pin_LoadL, /* load-linked (lwarx/ldarx) 32|64 bit value from mem */
+ Pin_Store, /* store a 8|16|32|64 bit value to mem */
+ Pin_StoreC, /* store-conditional (stwcx./stdcx.) 32|64 bit val */
+ Pin_Set, /* convert condition code to value 0 or 1 */
+ Pin_MfCR, /* move from condition register to GPR */
+ Pin_MFence, /* mem fence */
+
+ Pin_FpUnary, /* FP unary op */
+ Pin_FpBinary, /* FP binary op */
+      Pin_FpMulAcc,  /* FP multiply-accumulate style op */
+ Pin_FpLdSt, /* FP load/store */
+ Pin_FpSTFIW, /* stfiwx */
+ Pin_FpRSP, /* FP round IEEE754 double to IEEE754 single */
+ Pin_FpCftI, /* fcfid/fctid/fctiw */
+ Pin_FpCMov, /* FP floating point conditional move */
+ Pin_FpLdFPSCR, /* mtfsf */
+ Pin_FpCmp, /* FP compare, generating value into int reg */
+
+ Pin_RdWrLR, /* Read/Write Link Register */
+
+ Pin_AvLdSt, /* AV load/store (kludging for AMode_IR) */
+ Pin_AvUnary, /* AV unary general reg=>reg */
+
+ Pin_AvBinary, /* AV binary general reg,reg=>reg */
+      Pin_AvBin8x16,  /* AV binary, 8x16 */
+      Pin_AvBin16x8,  /* AV binary, 16x8 */
+ Pin_AvBin32x4, /* AV binary, 32x4 */
+
+ Pin_AvBin32Fx4, /* AV FP binary, 32Fx4 */
+ Pin_AvUn32Fx4, /* AV FP unary, 32Fx4 */
+
+ Pin_AvPerm, /* AV permute (shuffle) */
+ Pin_AvSel, /* AV select */
+ Pin_AvShlDbl, /* AV shift-left double by imm */
+ Pin_AvSplat, /* One elem repeated throughout dst */
+ Pin_AvLdVSCR, /* mtvscr */
+ Pin_AvCMov /* AV conditional move */
+ }
+ PPCInstrTag;
+
+/* Destinations are on the LEFT (first operand) */
+
+typedef
+ struct {
+ PPCInstrTag tag;
+ union {
+ /* Get a 32/64-bit literal into a register.
+ May turn into a number of real insns. */
+ struct {
+ HReg dst;
+ ULong imm64;
+ } LI;
+ /* Integer add/sub/and/or/xor. Limitations:
+ - For add, the immediate, if it exists, is a signed 16.
+ - For sub, the immediate, if it exists, is a signed 16
+              which may not be -32768: sub-with-immediate is emitted
+              as addi with the negated immediate, and +32768 is not
+              representable as a signed 16.
+ - For and/or/xor, the immediate, if it exists,
+ is an unsigned 16.
+ */
+ struct {
+ PPCAluOp op;
+ HReg dst;
+ HReg srcL;
+ PPCRH* srcR;
+ } Alu;
+ /* Integer shl/shr/sar.
+ Limitations: the immediate, if it exists,
+            is an unsigned value, 1 .. 31 inclusive for 32-bit shifts
+            and 1 .. 63 inclusive for 64-bit shifts.
+ */
+ struct {
+ PPCShftOp op;
+ Bool sz32; /* mode64 has both 32 and 64bit shft */
+ HReg dst;
+ HReg srcL;
+ PPCRH* srcR;
+ } Shft;
+ /* */
+ struct {
+ Bool isAdd; /* else sub */
+ Bool setC; /* else read carry */
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } AddSubC;
+ /* If signed, the immediate, if it exists, is a signed 16,
+ else it is an unsigned 16. */
+ struct {
+ Bool syned;
+ Bool sz32; /* mode64 has both 32 and 64bit cmp */
+ UInt crfD;
+ HReg srcL;
+ PPCRH* srcR;
+ } Cmp;
+ /* Not, Neg, Clz32/64, Extsw */
+ struct {
+ PPCUnaryOp op;
+ HReg dst;
+ HReg src;
+ } Unary;
+ struct {
+         Bool syned;  /* meaningless if hi==False */
+ Bool hi; /* False=>low, True=>high */
+ Bool sz32; /* mode64 has both 32 & 64bit mull */
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } MulL;
+ /* ppc32 div/divu instruction. */
+ struct {
+ Bool syned;
+ Bool sz32; /* mode64 has both 32 & 64bit div */
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } Div;
+ /* Pseudo-insn. Call target (an absolute address), on given
+ condition (which could be Pct_ALWAYS). argiregs indicates
+ which of r3 .. r10 carries argument values for this call,
+ using a bit mask (1<<N is set if rN holds an arg, for N in
+ 3 .. 10 inclusive). */
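+         /* e.g. a helper taking two word-size args in r3 and r4 has
+            argiregs == (1<<3)|(1<<4) == 0x18. */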
+ struct {
+ PPCCondCode cond;
+ Addr64 target;
+ UInt argiregs;
+ } Call;
+ /* Pseudo-insn. Goto dst, on given condition (which could be
+ Pct_ALWAYS). */
+ struct {
+ IRJumpKind jk;
+ PPCCondCode cond;
+ PPCRI* dst;
+ } Goto;
+ /* Mov src to dst on the given condition, which may not
+ be the bogus Pct_ALWAYS. */
+ struct {
+ PPCCondCode cond;
+ HReg dst;
+ PPCRI* src;
+ } CMov;
+ /* Zero extending loads. Dst size is host word size */
+ struct {
+ UChar sz; /* 1|2|4|8 */
+ HReg dst;
+ PPCAMode* src;
+ } Load;
+ /* Load-and-reserve (lwarx, ldarx) */
+ struct {
+ UChar sz; /* 4|8 */
+ HReg dst;
+ HReg src;
+ } LoadL;
+ /* 64/32/16/8 bit stores */
+ struct {
+ UChar sz; /* 1|2|4|8 */
+ PPCAMode* dst;
+ HReg src;
+ } Store;
+ /* Store-conditional (stwcx., stdcx.) */
+ struct {
+ UChar sz; /* 4|8 */
+ HReg dst;
+ HReg src;
+ } StoreC;
+ /* Convert a ppc condition code to value 0 or 1. */
+ struct {
+ PPCCondCode cond;
+ HReg dst;
+ } Set;
+ /* Move the entire CR to a GPR */
+ struct {
+ HReg dst;
+ } MfCR;
+ /* Mem fence. In short, an insn which flushes all preceding
+ loads and stores as much as possible before continuing.
+ On PPC we emit a "sync". */
+ struct {
+ } MFence;
+
+ /* PPC Floating point */
+ struct {
+ PPCFpOp op;
+ HReg dst;
+ HReg src;
+ } FpUnary;
+ struct {
+ PPCFpOp op;
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } FpBinary;
+ struct {
+ PPCFpOp op;
+ HReg dst;
+ HReg srcML;
+ HReg srcMR;
+ HReg srcAcc;
+ } FpMulAcc;
+ struct {
+ Bool isLoad;
+ UChar sz; /* only 4 (IEEE single) or 8 (IEEE double) */
+ HReg reg;
+ PPCAMode* addr;
+ } FpLdSt;
+ struct {
+ HReg addr; /* int reg */
+ HReg data; /* float reg */
+ } FpSTFIW;
+ /* Round 64-bit FP value to 32-bit FP value in an FP reg. */
+ struct {
+ HReg src;
+ HReg dst;
+ } FpRSP;
+ /* fcfid/fctid/fctiw. Note there's no fcfiw so fromI==True
+ && int32==True is not allowed. */
+ struct {
+ Bool fromI; /* False==F->I, True==I->F */
+ Bool int32; /* True== I is 32, False==I is 64 */
+ HReg src;
+ HReg dst;
+ } FpCftI;
+ /* FP mov src to dst on the given condition. */
+ struct {
+ PPCCondCode cond;
+ HReg dst;
+ HReg src;
+ } FpCMov;
+ /* Load FP Status & Control Register */
+ struct {
+ HReg src;
+ } FpLdFPSCR;
+ /* Do a compare, generating result into an int register. */
+ struct {
+ UChar crfD;
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } FpCmp;
+
+ /* Read/Write Link Register */
+ struct {
+ Bool wrLR;
+ HReg gpr;
+ } RdWrLR;
+
+ /* Simplistic AltiVec */
+ struct {
+ Bool isLoad;
+ UChar sz; /* 8|16|32|128 */
+ HReg reg;
+ PPCAMode* addr;
+ } AvLdSt;
+ struct {
+ PPCAvOp op;
+ HReg dst;
+ HReg src;
+ } AvUnary;
+ struct {
+ PPCAvOp op;
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } AvBinary;
+ struct {
+ PPCAvOp op;
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } AvBin8x16;
+ struct {
+ PPCAvOp op;
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } AvBin16x8;
+ struct {
+ PPCAvOp op;
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } AvBin32x4;
+ struct {
+ PPCAvFpOp op;
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } AvBin32Fx4;
+ struct {
+ PPCAvFpOp op;
+ HReg dst;
+ HReg src;
+ } AvUn32Fx4;
+ /* Perm,Sel,SlDbl,Splat are all weird AV permutations */
+ struct {
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ HReg ctl;
+ } AvPerm;
+ struct {
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ HReg ctl;
+ } AvSel;
+ struct {
+ UChar shift;
+ HReg dst;
+ HReg srcL;
+ HReg srcR;
+ } AvShlDbl;
+ struct {
+ UChar sz; /* 8,16,32 */
+ HReg dst;
+ PPCVI5s* src;
+ } AvSplat;
+ /* Mov src to dst on the given condition, which may not
+            be the bogus Pct_ALWAYS. */
+ struct {
+ PPCCondCode cond;
+ HReg dst;
+ HReg src;
+ } AvCMov;
+ /* Load AltiVec Status & Control Register */
+ struct {
+ HReg src;
+ } AvLdVSCR;
+ } Pin;
+ }
+ PPCInstr;
+
+
+extern PPCInstr* PPCInstr_LI ( HReg, ULong, Bool );
+extern PPCInstr* PPCInstr_Alu ( PPCAluOp, HReg, HReg, PPCRH* );
+extern PPCInstr* PPCInstr_Shft ( PPCShftOp, Bool sz32, HReg, HReg, PPCRH* );
+extern PPCInstr* PPCInstr_AddSubC ( Bool, Bool, HReg, HReg, HReg );
+extern PPCInstr* PPCInstr_Cmp ( Bool, Bool, UInt, HReg, PPCRH* );
+extern PPCInstr* PPCInstr_Unary ( PPCUnaryOp op, HReg dst, HReg src );
+extern PPCInstr* PPCInstr_MulL ( Bool syned, Bool hi32, Bool sz32, HReg, HReg, HReg );
+extern PPCInstr* PPCInstr_Div ( Bool syned, Bool sz32, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_Call ( PPCCondCode, Addr64, UInt );
+extern PPCInstr* PPCInstr_Goto ( IRJumpKind, PPCCondCode cond, PPCRI* dst );
+extern PPCInstr* PPCInstr_CMov ( PPCCondCode, HReg dst, PPCRI* src );
+extern PPCInstr* PPCInstr_Load ( UChar sz,
+ HReg dst, PPCAMode* src, Bool mode64 );
+extern PPCInstr* PPCInstr_LoadL ( UChar sz,
+ HReg dst, HReg src, Bool mode64 );
+extern PPCInstr* PPCInstr_Store ( UChar sz, PPCAMode* dst,
+ HReg src, Bool mode64 );
+extern PPCInstr* PPCInstr_StoreC ( UChar sz, HReg dst, HReg src,
+ Bool mode64 );
+extern PPCInstr* PPCInstr_Set ( PPCCondCode cond, HReg dst );
+extern PPCInstr* PPCInstr_MfCR ( HReg dst );
+extern PPCInstr* PPCInstr_MFence ( void );
+
+extern PPCInstr* PPCInstr_FpUnary ( PPCFpOp op, HReg dst, HReg src );
+extern PPCInstr* PPCInstr_FpBinary ( PPCFpOp op, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_FpMulAcc ( PPCFpOp op, HReg dst, HReg srcML,
+ HReg srcMR, HReg srcAcc );
+extern PPCInstr* PPCInstr_FpLdSt ( Bool isLoad, UChar sz, HReg, PPCAMode* );
+extern PPCInstr* PPCInstr_FpSTFIW ( HReg addr, HReg data );
+extern PPCInstr* PPCInstr_FpRSP ( HReg dst, HReg src );
+extern PPCInstr* PPCInstr_FpCftI ( Bool fromI, Bool int32,
+ HReg dst, HReg src );
+extern PPCInstr* PPCInstr_FpCMov ( PPCCondCode, HReg dst, HReg src );
+extern PPCInstr* PPCInstr_FpLdFPSCR ( HReg src );
+extern PPCInstr* PPCInstr_FpCmp ( HReg dst, HReg srcL, HReg srcR );
+
+extern PPCInstr* PPCInstr_RdWrLR ( Bool wrLR, HReg gpr );
+
+extern PPCInstr* PPCInstr_AvLdSt ( Bool isLoad, UChar sz, HReg, PPCAMode* );
+extern PPCInstr* PPCInstr_AvUnary ( PPCAvOp op, HReg dst, HReg src );
+extern PPCInstr* PPCInstr_AvBinary ( PPCAvOp op, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_AvBin8x16 ( PPCAvOp op, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_AvBin16x8 ( PPCAvOp op, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_AvBin32x4 ( PPCAvOp op, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_AvBin32Fx4 ( PPCAvOp op, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_AvUn32Fx4 ( PPCAvOp op, HReg dst, HReg src );
+extern PPCInstr* PPCInstr_AvPerm ( HReg dst, HReg srcL, HReg srcR, HReg ctl );
+extern PPCInstr* PPCInstr_AvSel ( HReg ctl, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_AvShlDbl ( UChar shift, HReg dst, HReg srcL, HReg srcR );
+extern PPCInstr* PPCInstr_AvSplat ( UChar sz, HReg dst, PPCVI5s* src );
+extern PPCInstr* PPCInstr_AvCMov ( PPCCondCode, HReg dst, HReg src );
+extern PPCInstr* PPCInstr_AvLdVSCR ( HReg src );
+
+extern void ppPPCInstr ( PPCInstr*, Bool mode64 );
+
+/* Some functions that insulate the register allocator from details
+ of the underlying instruction set. */
+extern void getRegUsage_PPCInstr ( HRegUsage*, PPCInstr*, Bool mode64 );
+extern void mapRegs_PPCInstr ( HRegRemap*, PPCInstr* , Bool mode64);
+extern Bool isMove_PPCInstr ( PPCInstr*, HReg*, HReg* );
+extern Int emit_PPCInstr ( UChar* buf, Int nbuf, PPCInstr*,
+ Bool mode64, void* dispatch );
+
+extern void genSpill_PPC ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 );
+extern void genReload_PPC ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 );
+
+extern void getAllocableRegs_PPC ( Int*, HReg**, Bool mode64 );
+extern HInstrArray* iselSB_PPC ( IRSB*, VexArch,
+ VexArchInfo*,
+ VexAbiInfo* );
+
+#endif /* ndef __VEX_HOST_PPC_DEFS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_ppc_defs.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_ppc_isel.c b/VEX/priv/host_ppc_isel.c
new file mode 100644
index 0000000..4ae18f3
--- /dev/null
+++ b/VEX/priv/host_ppc_isel.c
@@ -0,0 +1,4206 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_ppc_isel.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+#include "ir_match.h"
+#include "main_util.h"
+#include "main_globals.h"
+#include "host_generic_regs.h"
+#include "host_ppc_defs.h"
+
+/* GPR register class for ppc32/64 */
+#define HRcGPR(__mode64) (__mode64 ? HRcInt64 : HRcInt32)
+
+
+/*---------------------------------------------------------*/
+/*--- Register Usage Conventions ---*/
+/*---------------------------------------------------------*/
+/*
+ Integer Regs
+ ------------
+ GPR0 Reserved
+ GPR1 Stack Pointer
+ GPR2 not used - TOC pointer
+   GPR3:10  Allocatable
+ GPR11 if mode64: not used - calls by ptr / env ptr for some langs
+ GPR12 if mode64: not used - exceptions / global linkage code
+ GPR13 not used - Thread-specific pointer
+   GPR14:28 Allocatable
+ GPR29 Unused by us (reserved for the dispatcher)
+ GPR30 AltiVec temp spill register
+ GPR31 GuestStatePointer
+
+   Of the allocatable regs:
+ if (mode64)
+ GPR3:10 Caller-saved regs
+ else
+ GPR3:12 Caller-saved regs
+ GPR14:29 Callee-saved regs
+
+ GPR3 [Return | Parameter] - carrying reg
+ GPR4:10 Parameter-carrying regs
+
+
+ Floating Point Regs
+ -------------------
+   FPR0:31  Allocatable
+
+ FPR0 Caller-saved - scratch reg
+ if (mode64)
+ FPR1:13 Caller-saved - param & return regs
+ else
+ FPR1:8 Caller-saved - param & return regs
+ FPR9:13 Caller-saved regs
+ FPR14:31 Callee-saved regs
+
+
+ Vector Regs (on processors with the VMX feature)
+ -----------
+ VR0-VR1 Volatile scratch registers
+   VR2-VR13   Volatile vector parameter registers
+ VR14-VR19 Volatile scratch registers
+ VR20-VR31 Non-volatile registers
+ VRSAVE Non-volatile 32-bit register
+*/
+
+
+/*---------------------------------------------------------*/
+/*--- PPC FP Status & Control Register Conventions ---*/
+/*---------------------------------------------------------*/
+/*
+ Vex-generated code expects to run with the FPU set as follows: all
+ exceptions masked. The rounding mode is set appropriately before
+ each floating point insn emitted (or left unchanged if known to be
+ correct already). There are a few fp insns (fmr,fneg,fabs,fnabs),
+ which are unaffected by the rm and so the rounding mode is not set
+ prior to them.
+
+ At least on MPC7447A (Mac Mini), frsqrte is also not affected by
+ rounding mode. At some point the ppc docs get sufficiently vague
+ that the only way to find out is to write test programs.
+*/
+/* Notes on the FP instruction set, 6 Feb 06.
+
+What exns -> CR1 ? Sets FPRF ? Observes RM ?
+-------------------------------------------------------------
+
+fmr[.] if . n n
+fneg[.] if . n n
+fabs[.] if . n n
+fnabs[.] if . n n
+
+fadd[.] if . y y
+fadds[.] if . y y
+fcfid[.] (i64->dbl) if . y y
+fcmpo (cmp, result n n n
+fcmpu to crfD) n n n
+fctid[.] (dbl->i64) if . ->undef y
+fctidz[.] (dbl->i64) if . ->undef rounds-to-zero
+fctiw[.] (dbl->i32) if . ->undef y
+fctiwz[.] (dbl->i32) if . ->undef rounds-to-zero
+fdiv[.] if . y y
+fdivs[.] if . y y
+fmadd[.] if . y y
+fmadds[.] if . y y
+fmsub[.] if . y y
+fmsubs[.] if . y y
+fmul[.] if . y y
+fmuls[.] if . y y
+
+(note: for fnm*, rounding happens before final negation)
+fnmadd[.] if . y y
+fnmadds[.] if . y y
+fnmsub[.] if . y y
+fnmsubs[.] if . y y
+
+fre[.] if . y y
+fres[.] if . y y
+
+frsqrte[.] if . y apparently not
+
+fsqrt[.] if . y y
+fsqrts[.] if . y y
+fsub[.] if . y y
+fsubs[.] if . y y
+
+
+fpscr: bits 30-31 (ibm) are RM
+ 24-29 (ibm) are exnmasks/non-IEEE bit, all zero
+ 15-19 (ibm) is FPRF: class, <, =, >, UNord
+
+The ppc front end (guest) makes fpscr read as all zeros except RM (and maybe FPRF
+in future)
+
+mcrfs - move fpscr field to CR field
+mtfsfi[.] - 4 bit imm moved to fpscr field
+mtfsf[.] - move frS[low 1/2] to fpscr but using 8-bit field mask
+mtfsb1[.] - set given fpscr bit
+mtfsb0[.] - clear given fpscr bit
+mffs[.] - move all fpscr to frD[low 1/2]
+
+For [.] presumably cr1 is set with exn summary bits, as per
+main FP insns
+
+A single precision store truncates/denormalises the in-register value,
+but does not round it.  This is so that lfs followed by stfs is
+always the identity.
+*/
+
+
+/*---------------------------------------------------------*/
+/*--- misc helpers ---*/
+/*---------------------------------------------------------*/
+
+/* These are duplicated in guest-ppc/toIR.c */
+static IRExpr* unop ( IROp op, IRExpr* a )
+{
+ return IRExpr_Unop(op, a);
+}
+
+static IRExpr* mkU32 ( UInt i )
+{
+ return IRExpr_Const(IRConst_U32(i));
+}
+
+static IRExpr* bind ( Int binder )
+{
+ return IRExpr_Binder(binder);
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISelEnv ---*/
+/*---------------------------------------------------------*/
+
+/* This carries around:
+
+ - A mapping from IRTemp to IRType, giving the type of any IRTemp we
+ might encounter. This is computed before insn selection starts,
+ and does not change.
+
+ - A mapping from IRTemp to HReg. This tells the insn selector
+ which virtual register(s) are associated with each IRTemp
+ temporary. This is computed before insn selection starts, and
+ does not change. We expect this mapping to map precisely the
+ same set of IRTemps as the type mapping does.
+
+ - vregmap holds the primary register for the IRTemp.
+ - vregmapHI holds the secondary register for the IRTemp,
+ if any is needed. That's only for Ity_I64 temps
+ in 32 bit mode or Ity_I128 temps in 64-bit mode.
+
+ - The name of the vreg in which we stash a copy of the link reg,
+ so helper functions don't kill it.
+
+ - The code array, that is, the insns selected so far.
+
+ - A counter, for generating new virtual registers.
+
+ - The host subarchitecture we are selecting insns for.
+ This is set at the start and does not change.
+
+ - A Bool to tell us if the host is 32 or 64bit.
+ This is set at the start and does not change.
+
+ - An IRExpr*, which may be NULL, holding the IR expression (an
+ IRRoundingMode-encoded value) to which the FPU's rounding mode
+ was most recently set. Setting to NULL is always safe. Used to
+ avoid redundant settings of the FPU's rounding mode, as
+ described in set_FPU_rounding_mode below.
+
+ - A VexMiscInfo*, needed for knowing how to generate
+ function calls for this target
+*/
+
+typedef
+ struct {
+ IRTypeEnv* type_env;
+
+ HReg* vregmap;
+ HReg* vregmapHI;
+ Int n_vregmap;
+
+ HReg savedLR;
+
+ HInstrArray* code;
+
+ Int vreg_ctr;
+
+ /* 27 Jan 06: Not currently used, but should be */
+ UInt hwcaps;
+
+ Bool mode64;
+
+ IRExpr* previous_rm;
+
+ VexAbiInfo* vbi;
+ }
+ ISelEnv;
+
+
+static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
+{
+ vassert(tmp >= 0);
+ vassert(tmp < env->n_vregmap);
+ return env->vregmap[tmp];
+}
+
+static void lookupIRTempPair ( HReg* vrHI, HReg* vrLO,
+ ISelEnv* env, IRTemp tmp )
+{
+ vassert(!env->mode64);
+ vassert(tmp >= 0);
+ vassert(tmp < env->n_vregmap);
+ vassert(env->vregmapHI[tmp] != INVALID_HREG);
+ *vrLO = env->vregmap[tmp];
+ *vrHI = env->vregmapHI[tmp];
+}
+
+static void addInstr ( ISelEnv* env, PPCInstr* instr )
+{
+ addHInstr(env->code, instr);
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ ppPPCInstr(instr, env->mode64);
+ vex_printf("\n");
+ }
+}
+
+static HReg newVRegI ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcGPR(env->mode64),
+ True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+static HReg newVRegF ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+static HReg newVRegV ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Forward declarations ---*/
+/*---------------------------------------------------------*/
+
+/* These are organised as iselXXX and iselXXX_wrk pairs. The
+ iselXXX_wrk do the real work, but are not to be called directly.
+ For each XXX, iselXXX calls its iselXXX_wrk counterpart, then
+ checks that all returned registers are virtual. You should not
+ call the _wrk version directly.
+
+ 'Word' refers to the size of the native machine word, that is,
+ 32-bit int in 32-bit mode and 64-bit int in 64-bit mode. '2Word'
+ therefore refers to a double-width (64/128-bit) quantity in two
+ integer registers.
+*/
+/* 32-bit mode: compute an I8/I16/I32 into a GPR.
+ 64-bit mode: compute an I8/I16/I32/I64 into a GPR. */
+static HReg iselWordExpr_R_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselWordExpr_R ( ISelEnv* env, IRExpr* e );
+
+/* 32-bit mode: Compute an I8/I16/I32 into a RH
+ (reg-or-halfword-immediate).
+ 64-bit mode: Compute an I8/I16/I32/I64 into a RH
+ (reg-or-halfword-immediate).
+ It's important to specify whether the immediate is to be regarded
+ as signed or not. If yes, this will never return -32768 as an
+   immediate; this guarantees that all signed immediates that are
+   returned can have their sign inverted if need be.
+*/
+static PPCRH* iselWordExpr_RH_wrk ( ISelEnv* env,
+ Bool syned, IRExpr* e );
+static PPCRH* iselWordExpr_RH ( ISelEnv* env,
+ Bool syned, IRExpr* e );
+
+/* 32-bit mode: compute an I32 into a RI (reg or 32-bit immediate).
+ 64-bit mode: compute an I64 into a RI (reg or 64-bit immediate). */
+static PPCRI* iselWordExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
+static PPCRI* iselWordExpr_RI ( ISelEnv* env, IRExpr* e );
+
+/* In 32 bit mode ONLY, compute an I8 into a
+ reg-or-5-bit-unsigned-immediate, the latter being an immediate in
+ the range 1 .. 31 inclusive. Used for doing shift amounts. */
+static PPCRH* iselWordExpr_RH5u_wrk ( ISelEnv* env, IRExpr* e );
+static PPCRH* iselWordExpr_RH5u ( ISelEnv* env, IRExpr* e );
+
+/* In 64-bit mode ONLY, compute an I8 into a
+ reg-or-6-bit-unsigned-immediate, the latter being an immediate in
+ the range 1 .. 63 inclusive. Used for doing shift amounts. */
+static PPCRH* iselWordExpr_RH6u_wrk ( ISelEnv* env, IRExpr* e );
+static PPCRH* iselWordExpr_RH6u ( ISelEnv* env, IRExpr* e );
+
+/* 32-bit mode: compute an I32 into an AMode.
+ 64-bit mode: compute an I64 into an AMode.
+
+   Requires knowing (xferTy) the type of data to be loaded/stored
+ using this amode. That is so that, for 64-bit code generation, any
+ PPCAMode_IR returned will have an index (immediate offset) field
+ that is guaranteed to be 4-aligned, if there is any chance that the
+ amode is to be used in ld/ldu/lda/std/stdu.
+
+ Since there are no such restrictions on 32-bit insns, xferTy is
+ ignored for 32-bit code generation. */
+static PPCAMode* iselWordExpr_AMode_wrk ( ISelEnv* env, IRExpr* e, IRType xferTy );
+static PPCAMode* iselWordExpr_AMode ( ISelEnv* env, IRExpr* e, IRType xferTy );
+
+/* 32-bit mode ONLY: compute an I64 into a GPR pair. */
+static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+static void iselInt64Expr ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+
+/* 64-bit mode ONLY: compute an I128 into a GPR64 pair. */
+static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+static void iselInt128Expr ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+
+static PPCCondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
+static PPCCondCode iselCondCode ( ISelEnv* env, IRExpr* e );
+
+static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselDblExpr ( ISelEnv* env, IRExpr* e );
+
+static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselFltExpr ( ISelEnv* env, IRExpr* e );
+
+static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselVecExpr ( ISelEnv* env, IRExpr* e );
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Misc helpers ---*/
+/*---------------------------------------------------------*/
+
+/* Make an int reg-reg move. */
+
+static PPCInstr* mk_iMOVds_RR ( HReg r_dst, HReg r_src )
+{
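+   /* "or rD,rS,rS" is the canonical encoding of "mr rD,rS". */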
+ vassert(hregClass(r_dst) == hregClass(r_src));
+ vassert(hregClass(r_src) == HRcInt32 ||
+ hregClass(r_src) == HRcInt64);
+ return PPCInstr_Alu(Palu_OR, r_dst, r_src, PPCRH_Reg(r_src));
+}
+
+/* Advance/retreat %r1 by n. */
+
+static void add_to_sp ( ISelEnv* env, UInt n )
+{
+ HReg sp = StackFramePtr(env->mode64);
+ vassert(n < 256 && (n%16) == 0);
+ addInstr(env, PPCInstr_Alu( Palu_ADD, sp, sp,
+ PPCRH_Imm(True,toUShort(n)) ));
+}
+
+static void sub_from_sp ( ISelEnv* env, UInt n )
+{
+ HReg sp = StackFramePtr(env->mode64);
+ vassert(n < 256 && (n%16) == 0);
+ addInstr(env, PPCInstr_Alu( Palu_SUB, sp, sp,
+ PPCRH_Imm(True,toUShort(n)) ));
+}
+
+/*
+  Returns a quadword (16-byte) aligned address on the stack:
+   - copies SP, adds 16 bytes, aligns down to a quadword boundary.
+  Use sub_from_sp(env, 32) before calling this,
+  as it expects to have 32 bytes to play with.
+*/
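+/* Example: SP == 0x1008  =>  +16 == 0x1018  =>  & ~0xF == 0x1010,
+   which is 16-aligned and lies within the 32 reserved bytes. */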
+static HReg get_sp_aligned16 ( ISelEnv* env )
+{
+ HReg r = newVRegI(env);
+ HReg align16 = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR(r, StackFramePtr(env->mode64)));
+ // add 16
+ addInstr(env, PPCInstr_Alu( Palu_ADD, r, r,
+ PPCRH_Imm(True,toUShort(16)) ));
+ // mask to quadword
+ addInstr(env,
+ PPCInstr_LI(align16, 0xFFFFFFFFFFFFFFF0ULL, env->mode64));
+ addInstr(env, PPCInstr_Alu(Palu_AND, r,r, PPCRH_Reg(align16)));
+ return r;
+}
+
+
+
+/* Load 2*I32 regs to fp reg */
+static HReg mk_LoadRR32toFPR ( ISelEnv* env,
+ HReg r_srcHi, HReg r_srcLo )
+{
+ HReg fr_dst = newVRegF(env);
+ PPCAMode *am_addr0, *am_addr1;
+
+ vassert(!env->mode64);
+ vassert(hregClass(r_srcHi) == HRcInt32);
+ vassert(hregClass(r_srcLo) == HRcInt32);
+
+ sub_from_sp( env, 16 ); // Move SP down 16 bytes
+ am_addr0 = PPCAMode_IR( 0, StackFramePtr(env->mode64) );
+ am_addr1 = PPCAMode_IR( 4, StackFramePtr(env->mode64) );
+
+ // store hi,lo as Ity_I32's
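+   //   (big-endian host: the high 32 bits go at the lower address)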
+ addInstr(env, PPCInstr_Store( 4, am_addr0, r_srcHi, env->mode64 ));
+ addInstr(env, PPCInstr_Store( 4, am_addr1, r_srcLo, env->mode64 ));
+
+ // load as float
+ addInstr(env, PPCInstr_FpLdSt(True/*load*/, 8, fr_dst, am_addr0));
+
+ add_to_sp( env, 16 ); // Reset SP
+ return fr_dst;
+}
+
+/* Load I64 reg to fp reg */
+static HReg mk_LoadR64toFPR ( ISelEnv* env, HReg r_src )
+{
+ HReg fr_dst = newVRegF(env);
+ PPCAMode *am_addr0;
+
+ vassert(env->mode64);
+ vassert(hregClass(r_src) == HRcInt64);
+
+ sub_from_sp( env, 16 ); // Move SP down 16 bytes
+ am_addr0 = PPCAMode_IR( 0, StackFramePtr(env->mode64) );
+
+ // store as Ity_I64
+ addInstr(env, PPCInstr_Store( 8, am_addr0, r_src, env->mode64 ));
+
+ // load as float
+ addInstr(env, PPCInstr_FpLdSt(True/*load*/, 8, fr_dst, am_addr0));
+
+ add_to_sp( env, 16 ); // Reset SP
+ return fr_dst;
+}
+
+
+/* Given an amode, return one which references 4 bytes further
+ along. */
+
+static PPCAMode* advance4 ( ISelEnv* env, PPCAMode* am )
+{
+ PPCAMode* am4 = dopyPPCAMode( am );
+ if (am4->tag == Pam_IR
+ && am4->Pam.IR.index + 4 <= 32767) {
+ am4->Pam.IR.index += 4;
+ } else {
+ vpanic("advance4(ppc,host)");
+ }
+ return am4;
+}
+
+
+/* Given a guest-state array descriptor, an index expression and a
+ bias, generate a PPCAMode pointing at the relevant piece of
+ guest state. */
+static
+PPCAMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
+ IRExpr* off, Int bias )
+{
+ HReg rtmp, roff;
+ Int elemSz = sizeofIRType(descr->elemTy);
+ Int nElems = descr->nElems;
+ Int shift = 0;
+
+ /* Throw out any cases we don't need. In theory there might be a
+ day where we need to handle others, but not today. */
+
+ if (nElems != 16 && nElems != 32)
+ vpanic("genGuestArrayOffset(ppc host)(1)");
+
+ switch (elemSz) {
+ case 4: shift = 2; break;
+ case 8: shift = 3; break;
+ default: vpanic("genGuestArrayOffset(ppc host)(2)");
+ }
+
+ if (bias < -100 || bias > 100) /* somewhat arbitrarily */
+ vpanic("genGuestArrayOffset(ppc host)(3)");
+ if (descr->base < 0 || descr->base > 4000) /* somewhat arbitrarily */
+ vpanic("genGuestArrayOffset(ppc host)(4)");
+
+ /* Compute off into a reg, %off. Then return:
+
+ addi %tmp, %off, bias (if bias != 0)
+ andi %tmp, nElems-1
+ sldi %tmp, shift
+ addi %tmp, %tmp, base
+ ... Baseblockptr + %tmp ...
+ */
+ roff = iselWordExpr_R(env, off);
+ rtmp = newVRegI(env);
+ addInstr(env, PPCInstr_Alu(
+ Palu_ADD,
+ rtmp, roff,
+ PPCRH_Imm(True/*signed*/, toUShort(bias))));
+ addInstr(env, PPCInstr_Alu(
+ Palu_AND,
+ rtmp, rtmp,
+ PPCRH_Imm(False/*unsigned*/, toUShort(nElems-1))));
+ addInstr(env, PPCInstr_Shft(
+ Pshft_SHL,
+ env->mode64 ? False : True/*F:64-bit, T:32-bit shift*/,
+ rtmp, rtmp,
+ PPCRH_Imm(False/*unsigned*/, toUShort(shift))));
+ addInstr(env, PPCInstr_Alu(
+ Palu_ADD,
+ rtmp, rtmp,
+ PPCRH_Imm(True/*signed*/, toUShort(descr->base))));
+ return
+ PPCAMode_RR( GuestStatePtr(env->mode64), rtmp );
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Function call helpers ---*/
+/*---------------------------------------------------------*/
+
+/* Used only in doHelperCall. See big comment in doHelperCall re
+ handling of register-parameter args. This function figures out
+ whether evaluation of an expression might require use of a fixed
+ register. If in doubt return True (safe but suboptimal).
+*/
+static
+Bool mightRequireFixedRegs ( IRExpr* e )
+{
+ switch (e->tag) {
+ case Iex_RdTmp: case Iex_Const: case Iex_Get:
+ return False;
+ default:
+ return True;
+ }
+}
+
+
+/* Do a complete function call. guard is a Ity_Bit expression
+ indicating whether or not the call happens. If guard==NULL, the
+ call is unconditional. */
+
+static
+void doHelperCall ( ISelEnv* env,
+ Bool passBBP,
+ IRExpr* guard, IRCallee* cee, IRExpr** args )
+{
+ PPCCondCode cc;
+ HReg argregs[PPC_N_REGPARMS];
+ HReg tmpregs[PPC_N_REGPARMS];
+ Bool go_fast;
+ Int n_args, i, argreg;
+ UInt argiregs;
+ ULong target;
+ Bool mode64 = env->mode64;
+
+ /* Do we need to force use of an odd-even reg pair for 64-bit
+ args? */
+ Bool regalign_int64s
+ = (!mode64) && env->vbi->host_ppc32_regalign_int64_args;
+
+ /* Marshal args for a call and do the call.
+
+      If passBBP is True, the baseblock pointer (the guest state
+      pointer) is to be passed as the first arg.
+
+ This function only deals with a tiny set of possibilities, which
+ cover all helpers in practice. The restrictions are that only
+ arguments in registers are supported, hence only PPC_N_REGPARMS x
+ (mode32:32 | mode64:64) integer bits in total can be passed.
+ In fact the only supported arg type is (mode32:I32 | mode64:I64).
+
+ Generating code which is both efficient and correct when
+ parameters are to be passed in registers is difficult, for the
+ reasons elaborated in detail in comments attached to
+ doHelperCall() in priv/host-x86/isel.c. Here, we use a variant
+ of the method described in those comments.
+
+ The problem is split into two cases: the fast scheme and the
+ slow scheme. In the fast scheme, arguments are computed
+ directly into the target (real) registers. This is only safe
+ when we can be sure that computation of each argument will not
+ trash any real registers set by computation of any other
+ argument.
+
+ In the slow scheme, all args are first computed into vregs, and
+ once they are all done, they are moved to the relevant real
+ regs. This always gives correct code, but it also gives a bunch
+ of vreg-to-rreg moves which are usually redundant but are hard
+ for the register allocator to get rid of.
+
+ To decide which scheme to use, all argument expressions are
+ first examined. If they are all so simple that it is clear they
+ will be evaluated without use of any fixed registers, use the
+ fast scheme, else use the slow scheme. Note also that only
+ unconditional calls may use the fast scheme, since having to
+ compute a condition expression could itself trash real
+ registers.
+
+ Note this requires being able to examine an expression and
+ determine whether or not evaluation of it might use a fixed
+ register. That requires knowledge of how the rest of this insn
+ selector works. Currently just the following 3 are regarded as
+ safe -- hopefully they cover the majority of arguments in
+      practice: IRExpr_RdTmp, IRExpr_Const and IRExpr_Get.
+ */
+
+ /* Note that the cee->regparms field is meaningless on PPC32/64 host
+ (since there is only one calling convention) and so we always
+ ignore it. */
+
+ n_args = 0;
+ for (i = 0; args[i]; i++)
+ n_args++;
+
+ if (PPC_N_REGPARMS < n_args + (passBBP ? 1 : 0)) {
+ vpanic("doHelperCall(PPC): cannot currently handle > 8 args");
+ // PPC_N_REGPARMS
+ }
+
+ argregs[0] = hregPPC_GPR3(mode64);
+ argregs[1] = hregPPC_GPR4(mode64);
+ argregs[2] = hregPPC_GPR5(mode64);
+ argregs[3] = hregPPC_GPR6(mode64);
+ argregs[4] = hregPPC_GPR7(mode64);
+ argregs[5] = hregPPC_GPR8(mode64);
+ argregs[6] = hregPPC_GPR9(mode64);
+ argregs[7] = hregPPC_GPR10(mode64);
+ argiregs = 0;
+
+ tmpregs[0] = tmpregs[1] = tmpregs[2] =
+ tmpregs[3] = tmpregs[4] = tmpregs[5] =
+ tmpregs[6] = tmpregs[7] = INVALID_HREG;
+
+   /* First decide which scheme (slow or fast) is to be used.  Start
+      by assuming the fast scheme, and fall back to the slow one if
+      any contraindications appear. */
+
+ go_fast = True;
+
+ if (guard) {
+ if (guard->tag == Iex_Const
+ && guard->Iex.Const.con->tag == Ico_U1
+ && guard->Iex.Const.con->Ico.U1 == True) {
+ /* unconditional */
+ } else {
+ /* Not manifestly unconditional -- be conservative. */
+ go_fast = False;
+ }
+ }
+
+ if (go_fast) {
+ for (i = 0; i < n_args; i++) {
+ if (mightRequireFixedRegs(args[i])) {
+ go_fast = False;
+ break;
+ }
+ }
+ }
+
+ /* At this point the scheme to use has been established. Generate
+ code to get the arg values into the argument rregs. */
+
+ if (go_fast) {
+
+ /* FAST SCHEME */
+ argreg = 0;
+ if (passBBP) {
+ argiregs |= (1 << (argreg+3));
+ addInstr(env, mk_iMOVds_RR( argregs[argreg],
+ GuestStatePtr(mode64) ));
+ argreg++;
+ }
+
+ for (i = 0; i < n_args; i++) {
+ vassert(argreg < PPC_N_REGPARMS);
+ vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32 ||
+ typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
+ if (!mode64) {
+ if (typeOfIRExpr(env->type_env, args[i]) == Ity_I32) {
+ argiregs |= (1 << (argreg+3));
+ addInstr(env,
+ mk_iMOVds_RR( argregs[argreg],
+ iselWordExpr_R(env, args[i]) ));
+ } else { // Ity_I64
+ HReg rHi, rLo;
+ if (regalign_int64s && (argreg%2) == 1)
+ // ppc32 ELF abi spec for passing LONG_LONG
+ argreg++; // XXX: odd argreg => even rN
+ vassert(argreg < PPC_N_REGPARMS-1);
+ iselInt64Expr(&rHi,&rLo, env, args[i]);
+ argiregs |= (1 << (argreg+3));
+ addInstr(env, mk_iMOVds_RR( argregs[argreg++], rHi ));
+ argiregs |= (1 << (argreg+3));
+ addInstr(env, mk_iMOVds_RR( argregs[argreg], rLo));
+ }
+ } else { // mode64
+ argiregs |= (1 << (argreg+3));
+ addInstr(env, mk_iMOVds_RR( argregs[argreg],
+ iselWordExpr_R(env, args[i]) ));
+ }
+ argreg++;
+ }
+
+ /* Fast scheme only applies for unconditional calls. Hence: */
+ cc.test = Pct_ALWAYS;
+
+ } else {
+
+ /* SLOW SCHEME; move via temporaries */
+ argreg = 0;
+
+ if (passBBP) {
+ /* This is pretty stupid; better to move directly to r3
+ after the rest of the args are done. */
+ tmpregs[argreg] = newVRegI(env);
+ addInstr(env, mk_iMOVds_RR( tmpregs[argreg],
+ GuestStatePtr(mode64) ));
+ argreg++;
+ }
+
+ for (i = 0; i < n_args; i++) {
+ vassert(argreg < PPC_N_REGPARMS);
+ vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32 ||
+ typeOfIRExpr(env->type_env, args[i]) == Ity_I64);
+ if (!mode64) {
+ if (typeOfIRExpr(env->type_env, args[i]) == Ity_I32) {
+ tmpregs[argreg] = iselWordExpr_R(env, args[i]);
+ } else { // Ity_I64
+ HReg rHi, rLo;
+ if (regalign_int64s && (argreg%2) == 1)
+ // ppc32 ELF abi spec for passing LONG_LONG
+ argreg++; // XXX: odd argreg => even rN
+ vassert(argreg < PPC_N_REGPARMS-1);
+ iselInt64Expr(&rHi,&rLo, env, args[i]);
+ tmpregs[argreg++] = rHi;
+ tmpregs[argreg] = rLo;
+ }
+ } else { // mode64
+ tmpregs[argreg] = iselWordExpr_R(env, args[i]);
+ }
+ argreg++;
+ }
+
+ /* Now we can compute the condition. We can't do it earlier
+ because the argument computations could trash the condition
+ codes. Be a bit clever to handle the common case where the
+ guard is 1:Bit. */
+ cc.test = Pct_ALWAYS;
+ if (guard) {
+ if (guard->tag == Iex_Const
+ && guard->Iex.Const.con->tag == Ico_U1
+ && guard->Iex.Const.con->Ico.U1 == True) {
+ /* unconditional -- do nothing */
+ } else {
+ cc = iselCondCode( env, guard );
+ }
+ }
+
+ /* Move the args to their final destinations. */
+ for (i = 0; i < argreg; i++) {
+ if (tmpregs[i] == INVALID_HREG) // Skip invalid regs
+ continue;
+ /* None of these insns, including any spill code that might
+ be generated, may alter the condition codes. */
+ argiregs |= (1 << (i+3));
+ addInstr( env, mk_iMOVds_RR( argregs[i], tmpregs[i] ) );
+ }
+
+ }
+
+ target = mode64 ? Ptr_to_ULong(cee->addr) :
+ toUInt(Ptr_to_ULong(cee->addr));
+
+ /* Finally, the call itself. */
+ addInstr(env, PPCInstr_Call( cc, (Addr64)target, argiregs ));
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: FP rounding mode helpers ---*/
+/*---------------------------------------------------------*/
+
+///* Set FPU's rounding mode to the default */
+//static
+//void set_FPU_rounding_default ( ISelEnv* env )
+//{
+// HReg fr_src = newVRegF(env);
+// HReg r_src = newVRegI(env);
+//
+// /* Default rounding mode = 0x0
+// Only supporting the rounding-mode bits - the rest of FPSCR is 0x0
+// - so we can set the whole register at once (faster)
+// note: upper 32 bits ignored by FpLdFPSCR
+// */
+// addInstr(env, PPCInstr_LI(r_src, 0x0, env->mode64));
+// if (env->mode64) {
+// fr_src = mk_LoadR64toFPR( env, r_src ); // 1*I64 -> F64
+// } else {
+// fr_src = mk_LoadRR32toFPR( env, r_src, r_src ); // 2*I32 -> F64
+// }
+// addInstr(env, PPCInstr_FpLdFPSCR( fr_src ));
+//}
+
+/* Convert IR rounding mode to PPC encoding */
+static HReg roundModeIRtoPPC ( ISelEnv* env, HReg r_rmIR )
+{
+ /*
+ rounding mode | PPC | IR
+ ------------------------
+ to nearest | 00 | 00
+ to zero | 01 | 11
+ to +infinity | 10 | 10
+ to -infinity | 11 | 01
+ */
+ HReg r_rmPPC = newVRegI(env);
+ HReg r_tmp1 = newVRegI(env);
+
+ vassert(hregClass(r_rmIR) == HRcGPR(env->mode64));
+
+ // r_rmPPC = XOR(r_rmIR, r_rmIR << 1) & 3
+ //
+ // slwi tmp1, r_rmIR, 1
+ // xor tmp1, r_rmIR, tmp1
+ // andi r_rmPPC, tmp1, 3
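+   //
+   // Worked example (illustrative): r_rmIR = 01 ("to -infinity"):
+   //   tmp1    = 01 << 1    = 010
+   //   tmp1    = 01 ^ 010   = 011
+   //   r_rmPPC = 011 & 3    = 11    -- "to -infinity" per the table.
+   // The other three rows check out the same way.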
+
+ addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32bit shift*/,
+ r_tmp1, r_rmIR, PPCRH_Imm(False,1)));
+
+ addInstr(env, PPCInstr_Alu( Palu_XOR, r_tmp1, r_rmIR,
+ PPCRH_Reg(r_tmp1) ));
+
+ addInstr(env, PPCInstr_Alu( Palu_AND, r_rmPPC, r_tmp1,
+ PPCRH_Imm(False,3) ));
+
+ return r_rmPPC;
+}
+
+
+/* Set the FPU's rounding mode: 'mode' is an I32-typed expression
+ denoting a value in the range 0 .. 3, indicating a round mode
+ encoded as per type IRRoundingMode. Set the PPC FPSCR to have the
+ same rounding.
+
+ For speed & simplicity, we're setting the *entire* FPSCR here.
+
+ Setting the rounding mode is expensive. So this function tries to
+ avoid repeatedly setting the rounding mode to the same thing by
+ first comparing 'mode' to the 'mode' tree supplied in the previous
+ call to this function, if any. (The previous value is stored in
+ env->previous_rm.) If 'mode' is a single IR temporary 't' and
+ env->previous_rm is also just 't', then the setting is skipped.
+
+ This is safe because of the SSA property of IR: an IR temporary can
+ only be defined once and so will have the same value regardless of
+ where it appears in the block. Cool stuff, SSA.
+
+ A safety condition: all attempts to set the RM must be aware of
+ this mechanism - by being routed through the functions here.
+
+   Of course this only helps in blocks where the RM is set more than
+ once and it is set to the same value each time, *and* that value is
+ held in the same IR temporary each time. In order to assure the
+ latter as much as possible, the IR optimiser takes care to do CSE
+ on any block with any sign of floating point activity.
+*/
+static
+void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
+{
+ HReg fr_src = newVRegF(env);
+ HReg r_src;
+
+ vassert(typeOfIRExpr(env->type_env,mode) == Ity_I32);
+
+ /* Do we need to do anything? */
+ if (env->previous_rm
+ && env->previous_rm->tag == Iex_RdTmp
+ && mode->tag == Iex_RdTmp
+ && env->previous_rm->Iex.RdTmp.tmp == mode->Iex.RdTmp.tmp) {
+ /* no - setting it to what it was before. */
+ vassert(typeOfIRExpr(env->type_env, env->previous_rm) == Ity_I32);
+ return;
+ }
+
+ /* No luck - we better set it, and remember what we set it to. */
+ env->previous_rm = mode;
+
+ /* Only supporting the rounding-mode bits - the rest of FPSCR is
+ 0x0 - so we can set the whole register at once (faster). */
+
+ // Resolve rounding mode and convert to PPC representation
+ r_src = roundModeIRtoPPC( env, iselWordExpr_R(env, mode) );
+ // gpr -> fpr
+ if (env->mode64) {
+ fr_src = mk_LoadR64toFPR( env, r_src ); // 1*I64 -> F64
+ } else {
+ fr_src = mk_LoadRR32toFPR( env, r_src, r_src ); // 2*I32 -> F64
+ }
+
+ // Move to FPSCR
+ addInstr(env, PPCInstr_FpLdFPSCR( fr_src ));
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: vector helpers ---*/
+/*---------------------------------------------------------*/
+
+/* Generate all-zeroes into a new vector register.
+*/
+static HReg generate_zeroes_V128 ( ISelEnv* env )
+{
+ HReg dst = newVRegV(env);
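+   /* vxor dst,dst,dst yields zero regardless of dst's prior contents. */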
+ addInstr(env, PPCInstr_AvBinary(Pav_XOR, dst, dst, dst));
+ return dst;
+}
+
+
+/*
+  Generates code for AvSplat:
+  - takes an IRExpr* of type I8/I16/I32 and returns a vector reg with
+    the input value duplicated into every lane
+  - uses AvSplat(imm) for immediates which fit in a simm6;
+    otherwise must store the reg & load the vector
+*/
+static HReg mk_AvDuplicateRI( ISelEnv* env, IRExpr* e )
+{
+ HReg r_src;
+ HReg dst = newVRegV(env);
+ PPCRI* ri = iselWordExpr_RI(env, e);
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ UInt sz = (ty == Ity_I8) ? 8 : (ty == Ity_I16) ? 16 : 32;
+ vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
+
+ /* special case: immediate */
+ if (ri->tag == Pri_Imm) {
+ Int simm32 = (Int)ri->Pri.Imm;
+
+ /* figure out if it's do-able with imm splats. */
+ if (simm32 >= -32 && simm32 <= 31) {
+ Char simm6 = (Char)simm32;
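+         /* The splat immediate is a 5-bit signed field (-16..15), so
+            out-of-range values are built from two splats, e.g.
+            (illustrative):  20 = 4 - (-16)      -- the SUBU case below
+                            -20 = (-4) + (-16)   -- the ADDU case below */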
+ if (simm6 > 15) { /* 16:31 inclusive */
+ HReg v1 = newVRegV(env);
+ HReg v2 = newVRegV(env);
+ addInstr(env, PPCInstr_AvSplat(sz, v1, PPCVI5s_Imm(-16)));
+ addInstr(env, PPCInstr_AvSplat(sz, v2, PPCVI5s_Imm(simm6-16)));
+ addInstr(env,
+ (sz== 8) ? PPCInstr_AvBin8x16(Pav_SUBU, dst, v2, v1) :
+ (sz==16) ? PPCInstr_AvBin16x8(Pav_SUBU, dst, v2, v1)
+ : PPCInstr_AvBin32x4(Pav_SUBU, dst, v2, v1) );
+ return dst;
+ }
+ if (simm6 < -16) { /* -32:-17 inclusive */
+ HReg v1 = newVRegV(env);
+ HReg v2 = newVRegV(env);
+ addInstr(env, PPCInstr_AvSplat(sz, v1, PPCVI5s_Imm(-16)));
+ addInstr(env, PPCInstr_AvSplat(sz, v2, PPCVI5s_Imm(simm6+16)));
+ addInstr(env,
+ (sz== 8) ? PPCInstr_AvBin8x16(Pav_ADDU, dst, v2, v1) :
+ (sz==16) ? PPCInstr_AvBin16x8(Pav_ADDU, dst, v2, v1)
+ : PPCInstr_AvBin32x4(Pav_ADDU, dst, v2, v1) );
+ return dst;
+ }
+ /* simplest form: -16:15 inclusive */
+ addInstr(env, PPCInstr_AvSplat(sz, dst, PPCVI5s_Imm(simm6)));
+ return dst;
+ }
+
+ /* no luck; use the Slow way. */
+ r_src = newVRegI(env);
+ addInstr(env, PPCInstr_LI(r_src, (Long)simm32, env->mode64));
+ }
+ else {
+ r_src = ri->Pri.Reg;
+ }
+
+ /* default case: store r_src in lowest lane of 16-aligned mem,
+ load vector, splat lowest lane to dst */
+ {
+ /* CAB: Maybe faster to store r_src multiple times (sz dependent),
+ and simply load the vector? */
+ HReg r_aligned16;
+ HReg v_src = newVRegV(env);
+ PPCAMode *am_off12;
+
+ sub_from_sp( env, 32 ); // Move SP down
+ /* Get a 16-aligned address within our stack space */
+ r_aligned16 = get_sp_aligned16( env );
+ am_off12 = PPCAMode_IR( 12, r_aligned16 );
+
+ /* Store r_src in low word of 16-aligned mem */
+ addInstr(env, PPCInstr_Store( 4, am_off12, r_src, env->mode64 ));
+
+ /* Load src to vector[low lane] */
+ addInstr(env, PPCInstr_AvLdSt( True/*ld*/, 4, v_src, am_off12 ) );
+ add_to_sp( env, 32 ); // Reset SP
+
+ /* Finally, splat v_src[low_lane] to dst */
+ addInstr(env, PPCInstr_AvSplat(sz, dst, PPCVI5s_Reg(v_src)));
+ return dst;
+ }
+}
+
+
+/* For each lane of vSrc: result lane = (lane is a NaN) ? all 1s : all 0s */
+static HReg isNan ( ISelEnv* env, HReg vSrc )
+{
+ HReg zeros, msk_exp, msk_mnt, expt, mnts, vIsNan;
+
+ vassert(hregClass(vSrc) == HRcVec128);
+
+ zeros = mk_AvDuplicateRI(env, mkU32(0));
+ msk_exp = mk_AvDuplicateRI(env, mkU32(0x7F800000));
+ msk_mnt = mk_AvDuplicateRI(env, mkU32(0x7FFFFF));
+ expt = newVRegV(env);
+ mnts = newVRegV(env);
+ vIsNan = newVRegV(env);
+
+ /* 32bit float => sign(1) | exponent(8) | mantissa(23)
+ nan => exponent all ones, mantissa > 0 */
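+
+   /* e.g. (illustrative) 0x7FC00000, a quiet NaN, has an all-ones
+      exponent field and mantissa 0x400000 > 0, whereas 0x7F800000
+      (+infinity) has a zero mantissa and is correctly rejected. */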
+
+ addInstr(env, PPCInstr_AvBinary(Pav_AND, expt, vSrc, msk_exp));
+ addInstr(env, PPCInstr_AvBin32x4(Pav_CMPEQU, expt, expt, msk_exp));
+ addInstr(env, PPCInstr_AvBinary(Pav_AND, mnts, vSrc, msk_mnt));
+ addInstr(env, PPCInstr_AvBin32x4(Pav_CMPGTU, mnts, mnts, zeros));
+ addInstr(env, PPCInstr_AvBinary(Pav_AND, vIsNan, expt, mnts));
+ return vIsNan;
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expressions (64/32/16/8 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Select insns for an integer-typed expression, and add them to the
+ code list. Return a reg holding the result. This reg will be a
+ virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
+ want to modify it, ask for a new vreg, copy it in there, and modify
+ the copy. The register allocator will do its best to map both
+ vregs to the same real register, so the copies will often disappear
+ later in the game.
+
+ This should handle expressions of 64, 32, 16 and 8-bit type.
+ All results are returned in a (mode64 ? 64bit : 32bit) register.
+ For 16- and 8-bit expressions, the upper (32/48/56 : 16/24) bits
+ are arbitrary, so you should mask or sign extend partial values
+ if necessary.
+*/
+
+static HReg iselWordExpr_R ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselWordExpr_R_wrk(env, e);
+ /* sanity checks ... */
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+
+ vassert(hregClass(r) == HRcGPR(env->mode64));
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static HReg iselWordExpr_R_wrk ( ISelEnv* env, IRExpr* e )
+{
+ Bool mode64 = env->mode64;
+ MatchInfo mi;
+ DECLARE_PATTERN(p_32to1_then_1Uto8);
+
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I8 || ty == Ity_I16 ||
+ ty == Ity_I32 || ((ty == Ity_I64) && mode64));
+
+ switch (e->tag) {
+
+ /* --------- TEMP --------- */
+ case Iex_RdTmp:
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+
+ /* --------- LOAD --------- */
+ case Iex_Load: {
+ HReg r_dst;
+ PPCAMode* am_addr;
+ if (e->Iex.Load.end != Iend_BE)
+ goto irreducible;
+ r_dst = newVRegI(env);
+ am_addr = iselWordExpr_AMode( env, e->Iex.Load.addr, ty/*of xfer*/ );
+ addInstr(env, PPCInstr_Load( toUChar(sizeofIRType(ty)),
+ r_dst, am_addr, mode64 ));
+ return r_dst;
+ /*NOTREACHED*/
+ }
+
+ /* --------- BINARY OP --------- */
+ case Iex_Binop: {
+ PPCAluOp aluOp;
+ PPCShftOp shftOp;
+
+ /* Is it an addition or logical style op? */
+ switch (e->Iex.Binop.op) {
+ case Iop_Add8: case Iop_Add16: case Iop_Add32: case Iop_Add64:
+ aluOp = Palu_ADD; break;
+ case Iop_Sub8: case Iop_Sub16: case Iop_Sub32: case Iop_Sub64:
+ aluOp = Palu_SUB; break;
+ case Iop_And8: case Iop_And16: case Iop_And32: case Iop_And64:
+ aluOp = Palu_AND; break;
+ case Iop_Or8: case Iop_Or16: case Iop_Or32: case Iop_Or64:
+ aluOp = Palu_OR; break;
+ case Iop_Xor8: case Iop_Xor16: case Iop_Xor32: case Iop_Xor64:
+ aluOp = Palu_XOR; break;
+ default:
+ aluOp = Palu_INVALID; break;
+ }
+ /* For commutative ops we assume any literal
+ values are on the second operand. */
+ if (aluOp != Palu_INVALID) {
+ HReg r_dst = newVRegI(env);
+ HReg r_srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ PPCRH* ri_srcR = NULL;
+ /* get right arg into an RH, in the appropriate way */
+ switch (aluOp) {
+ case Palu_ADD: case Palu_SUB:
+ ri_srcR = iselWordExpr_RH(env, True/*signed*/,
+ e->Iex.Binop.arg2);
+ break;
+ case Palu_AND: case Palu_OR: case Palu_XOR:
+ ri_srcR = iselWordExpr_RH(env, False/*signed*/,
+ e->Iex.Binop.arg2);
+ break;
+ default:
+ vpanic("iselWordExpr_R_wrk-aluOp-arg2");
+ }
+ addInstr(env, PPCInstr_Alu(aluOp, r_dst, r_srcL, ri_srcR));
+ return r_dst;
+ }
+
+ /* a shift? */
+ switch (e->Iex.Binop.op) {
+ case Iop_Shl8: case Iop_Shl16: case Iop_Shl32: case Iop_Shl64:
+ shftOp = Pshft_SHL; break;
+ case Iop_Shr8: case Iop_Shr16: case Iop_Shr32: case Iop_Shr64:
+ shftOp = Pshft_SHR; break;
+ case Iop_Sar8: case Iop_Sar16: case Iop_Sar32: case Iop_Sar64:
+ shftOp = Pshft_SAR; break;
+ default:
+ shftOp = Pshft_INVALID; break;
+ }
+ /* we assume any literal values are on the second operand. */
+ if (shftOp != Pshft_INVALID) {
+ HReg r_dst = newVRegI(env);
+ HReg r_srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ PPCRH* ri_srcR = NULL;
+ /* get right arg into an RH, in the appropriate way */
+ switch (shftOp) {
+ case Pshft_SHL: case Pshft_SHR: case Pshft_SAR:
+ if (!mode64)
+ ri_srcR = iselWordExpr_RH5u(env, e->Iex.Binop.arg2);
+ else
+ ri_srcR = iselWordExpr_RH6u(env, e->Iex.Binop.arg2);
+ break;
+ default:
+ vpanic("iselIntExpr_R_wrk-shftOp-arg2");
+ }
+ /* widen the left arg if needed */
+ if (shftOp == Pshft_SHR || shftOp == Pshft_SAR) {
+ if (ty == Ity_I8 || ty == Ity_I16) {
+ PPCRH* amt = PPCRH_Imm(False,
+ toUShort(ty == Ity_I8 ? 24 : 16));
+ HReg tmp = newVRegI(env);
+ addInstr(env, PPCInstr_Shft(Pshft_SHL,
+ True/*32bit shift*/,
+ tmp, r_srcL, amt));
+ addInstr(env, PPCInstr_Shft(shftOp,
+ True/*32bit shift*/,
+ tmp, tmp, amt));
+ r_srcL = tmp;
+ vassert(0); /* AWAITING TEST CASE */
+ }
+ }
+         /* Only Ity_I64 expressions need 64-bit shifts; 32-bit shifts
+            are fine for all others. */
+ if (ty == Ity_I64) {
+ vassert(mode64);
+ addInstr(env, PPCInstr_Shft(shftOp, False/*64bit shift*/,
+ r_dst, r_srcL, ri_srcR));
+ } else {
+ addInstr(env, PPCInstr_Shft(shftOp, True/*32bit shift*/,
+ r_dst, r_srcL, ri_srcR));
+ }
+ return r_dst;
+ }
+
+ /* How about a div? */
+ if (e->Iex.Binop.op == Iop_DivS32 ||
+ e->Iex.Binop.op == Iop_DivU32) {
+ Bool syned = toBool(e->Iex.Binop.op == Iop_DivS32);
+ HReg r_dst = newVRegI(env);
+ HReg r_srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r_srcR = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_Div(syned, True/*32bit div*/,
+ r_dst, r_srcL, r_srcR));
+ return r_dst;
+ }
+ if (e->Iex.Binop.op == Iop_DivS64 ||
+ e->Iex.Binop.op == Iop_DivU64) {
+ Bool syned = toBool(e->Iex.Binop.op == Iop_DivS64);
+ HReg r_dst = newVRegI(env);
+ HReg r_srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r_srcR = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ vassert(mode64);
+ addInstr(env, PPCInstr_Div(syned, False/*64bit div*/,
+ r_dst, r_srcL, r_srcR));
+ return r_dst;
+ }
+
+ /* No? Anyone for a mul? */
+ if (e->Iex.Binop.op == Iop_Mul32
+ || e->Iex.Binop.op == Iop_Mul64) {
+ Bool syned = False;
+ Bool sz32 = (e->Iex.Binop.op != Iop_Mul64);
+ HReg r_dst = newVRegI(env);
+ HReg r_srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r_srcR = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_MulL(syned, False/*lo32*/, sz32,
+ r_dst, r_srcL, r_srcR));
+ return r_dst;
+ }
+
+ /* 32 x 32 -> 64 multiply */
+ if (mode64
+ && (e->Iex.Binop.op == Iop_MullU32
+ || e->Iex.Binop.op == Iop_MullS32)) {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg r_dst = newVRegI(env);
+ Bool syned = toBool(e->Iex.Binop.op == Iop_MullS32);
+ HReg r_srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r_srcR = iselWordExpr_R(env, e->Iex.Binop.arg2);
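+         /* The low 32 bits of a 32x32 multiply are the same whether the
+            operands are treated as signed or unsigned; only the high
+            half depends on signedness. */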
+ addInstr(env, PPCInstr_MulL(False/*signedness irrelevant*/,
+ False/*lo32*/, True/*32bit mul*/,
+ tLo, r_srcL, r_srcR));
+ addInstr(env, PPCInstr_MulL(syned,
+ True/*hi32*/, True/*32bit mul*/,
+ tHi, r_srcL, r_srcR));
+ addInstr(env, PPCInstr_Shft(Pshft_SHL, False/*64bit shift*/,
+ r_dst, tHi, PPCRH_Imm(False,32)));
+ addInstr(env, PPCInstr_Alu(Palu_OR,
+ r_dst, r_dst, PPCRH_Reg(tLo)));
+ return r_dst;
+ }
+
+ /* El-mutanto 3-way compare? */
+ if (e->Iex.Binop.op == Iop_CmpORD32S
+ || e->Iex.Binop.op == Iop_CmpORD32U) {
+ Bool syned = toBool(e->Iex.Binop.op == Iop_CmpORD32S);
+ HReg dst = newVRegI(env);
+ HReg srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ PPCRH* srcR = iselWordExpr_RH(env, syned, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_Cmp(syned, True/*32bit cmp*/,
+ 7/*cr*/, srcL, srcR));
+ addInstr(env, PPCInstr_MfCR(dst));
+ addInstr(env, PPCInstr_Alu(Palu_AND, dst, dst,
+ PPCRH_Imm(False,7<<1)));
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_CmpORD64S
+ || e->Iex.Binop.op == Iop_CmpORD64U) {
+ Bool syned = toBool(e->Iex.Binop.op == Iop_CmpORD64S);
+ HReg dst = newVRegI(env);
+ HReg srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ PPCRH* srcR = iselWordExpr_RH(env, syned, e->Iex.Binop.arg2);
+ vassert(mode64);
+ addInstr(env, PPCInstr_Cmp(syned, False/*64bit cmp*/,
+ 7/*cr*/, srcL, srcR));
+ addInstr(env, PPCInstr_MfCR(dst));
+ addInstr(env, PPCInstr_Alu(Palu_AND, dst, dst,
+ PPCRH_Imm(False,7<<1)));
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_Max32U) {
+ HReg r1 = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r2 = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ HReg rdst = newVRegI(env);
+ PPCCondCode cc = mk_PPCCondCode( Pct_TRUE, Pcf_7LT );
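+         /* rdst := r1; if (r1 <u r2) rdst := r2  -- i.e. unsigned max */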
+ addInstr(env, mk_iMOVds_RR(rdst, r1));
+ addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+ 7/*cr*/, rdst, PPCRH_Reg(r2)));
+ addInstr(env, PPCInstr_CMov(cc, rdst, PPCRI_Reg(r2)));
+ return rdst;
+ }
+
+ if (e->Iex.Binop.op == Iop_32HLto64) {
+ HReg r_Hi = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r_Lo = iselWordExpr_R(env, e->Iex.Binop.arg2);
+         HReg   r_dst  = newVRegI(env);
+         HReg   r_lo32 = newVRegI(env);
+         HReg   msk    = newVRegI(env);
+         vassert(mode64);
+         /* r_dst = OR( r_Hi<<32, r_Lo & 0xFFFFFFFF ).  AND into a fresh
+            vreg (r_lo32): registers returned by iselWordExpr_R must not
+            be modified. */
+         addInstr(env, PPCInstr_Shft(Pshft_SHL, False/*64bit shift*/,
+                                     r_dst, r_Hi, PPCRH_Imm(False,32)));
+         addInstr(env, PPCInstr_LI(msk, 0xFFFFFFFF, mode64));
+         addInstr(env, PPCInstr_Alu( Palu_AND, r_lo32, r_Lo,
+                                     PPCRH_Reg(msk) ));
+         addInstr(env, PPCInstr_Alu( Palu_OR, r_dst, r_dst,
+                                     PPCRH_Reg(r_lo32) ));
+ return r_dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_CmpF64) {
+ HReg fr_srcL = iselDblExpr(env, e->Iex.Binop.arg1);
+ HReg fr_srcR = iselDblExpr(env, e->Iex.Binop.arg2);
+
+ HReg r_ccPPC = newVRegI(env);
+ HReg r_ccIR = newVRegI(env);
+ HReg r_ccIR_b0 = newVRegI(env);
+ HReg r_ccIR_b2 = newVRegI(env);
+ HReg r_ccIR_b6 = newVRegI(env);
+
+ addInstr(env, PPCInstr_FpCmp(r_ccPPC, fr_srcL, fr_srcR));
+
+ /* Map compare result from PPC to IR,
+ conforming to CmpF64 definition. */
+ /*
+ FP cmp result | PPC | IR
+ --------------------------
+ UN | 0x1 | 0x45
+ EQ | 0x2 | 0x40
+ GT | 0x4 | 0x00
+ LT | 0x8 | 0x01
+ */
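+         /* Derivation (illustrative): IR bit 0 is set for UN (0x45) and
+            LT (0x01), i.e. ccPPC bits 0|3; IR bit 2 is set only for UN,
+            i.e. ccPPC bit 0; IR bit 6 is set for UN (0x45) and EQ
+            (0x40), i.e. ccPPC bits 0|1.  The three bit-fiddling
+            sequences below compute exactly those three bits. */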
+
+ // r_ccIR_b0 = r_ccPPC[0] | r_ccPPC[3]
+ addInstr(env, PPCInstr_Shft(Pshft_SHR, True/*32bit shift*/,
+ r_ccIR_b0, r_ccPPC,
+ PPCRH_Imm(False,0x3)));
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_ccIR_b0,
+ r_ccPPC, PPCRH_Reg(r_ccIR_b0)));
+ addInstr(env, PPCInstr_Alu(Palu_AND, r_ccIR_b0,
+ r_ccIR_b0, PPCRH_Imm(False,0x1)));
+
+ // r_ccIR_b2 = r_ccPPC[0]
+ addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32bit shift*/,
+ r_ccIR_b2, r_ccPPC,
+ PPCRH_Imm(False,0x2)));
+ addInstr(env, PPCInstr_Alu(Palu_AND, r_ccIR_b2,
+ r_ccIR_b2, PPCRH_Imm(False,0x4)));
+
+ // r_ccIR_b6 = r_ccPPC[0] | r_ccPPC[1]
+ addInstr(env, PPCInstr_Shft(Pshft_SHR, True/*32bit shift*/,
+ r_ccIR_b6, r_ccPPC,
+ PPCRH_Imm(False,0x1)));
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_ccIR_b6,
+ r_ccPPC, PPCRH_Reg(r_ccIR_b6)));
+ addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32bit shift*/,
+ r_ccIR_b6, r_ccIR_b6,
+ PPCRH_Imm(False,0x6)));
+ addInstr(env, PPCInstr_Alu(Palu_AND, r_ccIR_b6,
+ r_ccIR_b6, PPCRH_Imm(False,0x40)));
+
+ // r_ccIR = r_ccIR_b0 | r_ccIR_b2 | r_ccIR_b6
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_ccIR,
+ r_ccIR_b0, PPCRH_Reg(r_ccIR_b2)));
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_ccIR,
+ r_ccIR, PPCRH_Reg(r_ccIR_b6)));
+ return r_ccIR;
+ }
+
+ if (e->Iex.Binop.op == Iop_F64toI32S) {
+ /* This works in both mode64 and mode32. */
+ HReg r1 = StackFramePtr(env->mode64);
+ PPCAMode* zero_r1 = PPCAMode_IR( 0, r1 );
+ HReg fsrc = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg ftmp = newVRegF(env);
+ HReg idst = newVRegI(env);
+
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ sub_from_sp( env, 16 );
+ addInstr(env, PPCInstr_FpCftI(False/*F->I*/, True/*int32*/,
+ ftmp, fsrc));
+ addInstr(env, PPCInstr_FpSTFIW(r1, ftmp));
+ addInstr(env, PPCInstr_Load(4, idst, zero_r1, mode64));
+
+ /* in 64-bit mode we need to sign-widen idst. */
+ if (mode64)
+ addInstr(env, PPCInstr_Unary(Pun_EXTSW, idst, idst));
+
+ add_to_sp( env, 16 );
+
+ ///* Restore default FPU rounding. */
+ //set_FPU_rounding_default( env );
+ return idst;
+ }
+
+ if (e->Iex.Binop.op == Iop_F64toI64S) {
+ if (mode64) {
+ HReg r1 = StackFramePtr(env->mode64);
+ PPCAMode* zero_r1 = PPCAMode_IR( 0, r1 );
+ HReg fsrc = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg idst = newVRegI(env);
+ HReg ftmp = newVRegF(env);
+
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ sub_from_sp( env, 16 );
+ addInstr(env, PPCInstr_FpCftI(False/*F->I*/, False/*int64*/,
+ ftmp, fsrc));
+ addInstr(env, PPCInstr_FpLdSt(False/*store*/, 8, ftmp, zero_r1));
+ addInstr(env, PPCInstr_Load(8, idst, zero_r1, True/*mode64*/));
+ add_to_sp( env, 16 );
+
+ ///* Restore default FPU rounding. */
+ //set_FPU_rounding_default( env );
+ return idst;
+ }
+ }
+
+ break;
+ }
+
+ /* --------- UNARY OP --------- */
+ case Iex_Unop: {
+ IROp op_unop = e->Iex.Unop.op;
+
+ /* 1Uto8(32to1(expr32)) */
+ DEFINE_PATTERN(p_32to1_then_1Uto8,
+ unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
+ if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
+ IRExpr* expr32 = mi.bindee[0];
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, expr32);
+ addInstr(env, PPCInstr_Alu(Palu_AND, r_dst,
+ r_src, PPCRH_Imm(False,1)));
+ return r_dst;
+ }
+
+ /* 16Uto32(LDbe:I16(expr32)) */
+ {
+ DECLARE_PATTERN(p_LDbe16_then_16Uto32);
+ DEFINE_PATTERN(p_LDbe16_then_16Uto32,
+ unop(Iop_16Uto32,
+ IRExpr_Load(Iend_BE,Ity_I16,bind(0))) );
+ if (matchIRExpr(&mi,p_LDbe16_then_16Uto32,e)) {
+ HReg r_dst = newVRegI(env);
+ PPCAMode* amode
+ = iselWordExpr_AMode( env, mi.bindee[0], Ity_I16/*xfer*/ );
+ addInstr(env, PPCInstr_Load(2,r_dst,amode, mode64));
+ return r_dst;
+ }
+ }
+
+ switch (op_unop) {
+ case Iop_8Uto16:
+ case Iop_8Uto32:
+ case Iop_8Uto64:
+ case Iop_16Uto32:
+ case Iop_16Uto64: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ UShort mask = toUShort(op_unop==Iop_16Uto64 ? 0xFFFF :
+ op_unop==Iop_16Uto32 ? 0xFFFF : 0xFF);
+ addInstr(env, PPCInstr_Alu(Palu_AND,r_dst,r_src,
+ PPCRH_Imm(False,mask)));
+ return r_dst;
+ }
+ case Iop_32Uto64: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ vassert(mode64);
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SHL, False/*64bit shift*/,
+ r_dst, r_src, PPCRH_Imm(False,32)));
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SHR, False/*64bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False,32)));
+ return r_dst;
+ }
+ case Iop_8Sto16:
+ case Iop_8Sto32:
+ case Iop_16Sto32: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ UShort amt = toUShort(op_unop==Iop_16Sto32 ? 16 : 24);
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SHL, True/*32bit shift*/,
+ r_dst, r_src, PPCRH_Imm(False,amt)));
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False,amt)));
+ return r_dst;
+ }
+ case Iop_8Sto64:
+ case Iop_16Sto64: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ UShort amt = toUShort(op_unop==Iop_8Sto64 ? 56 :
+ op_unop==Iop_16Sto64 ? 48 : 32);
+ vassert(mode64);
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SHL, False/*64bit shift*/,
+ r_dst, r_src, PPCRH_Imm(False,amt)));
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SAR, False/*64bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False,amt)));
+ return r_dst;
+ }
+ case Iop_32Sto64: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ vassert(mode64);
+ /* According to the IBM docs, in 64 bit mode, srawi r,r,0
+ sign extends the lower 32 bits into the upper 32 bits. */
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ r_dst, r_src, PPCRH_Imm(False,0)));
+ return r_dst;
+ }
+ case Iop_Not8:
+ case Iop_Not16:
+ case Iop_Not32:
+ case Iop_Not64: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_Unary(Pun_NOT,r_dst,r_src));
+ return r_dst;
+ }
+ case Iop_64HIto32: {
+ if (!mode64) {
+ HReg rHi, rLo;
+ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rHi; /* and abandon rLo .. poor wee thing :-) */
+ } else {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SHR, False/*64bit shift*/,
+ r_dst, r_src, PPCRH_Imm(False,32)));
+ return r_dst;
+ }
+ }
+ case Iop_64to32: {
+ if (!mode64) {
+ HReg rHi, rLo;
+ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rLo; /* similar stupid comment to the above ... */
+ } else {
+ /* This is a no-op. */
+ return iselWordExpr_R(env, e->Iex.Unop.arg);
+ }
+ }
+ case Iop_64to16: {
+ if (mode64) { /* This is a no-op. */
+ return iselWordExpr_R(env, e->Iex.Unop.arg);
+ }
+ break; /* evidently not used in 32-bit mode */
+ }
+ case Iop_16HIto8:
+ case Iop_32HIto16: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ UShort shift = toUShort(op_unop == Iop_16HIto8 ? 8 : 16);
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SHR, True/*32bit shift*/,
+ r_dst, r_src, PPCRH_Imm(False,shift)));
+ return r_dst;
+ }
+ case Iop_128HIto64:
+ if (mode64) {
+ HReg rHi, rLo;
+ iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rHi; /* and abandon rLo .. poor wee thing :-) */
+ }
+ break;
+ case Iop_128to64:
+ if (mode64) {
+ HReg rHi, rLo;
+ iselInt128Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rLo; /* similar stupid comment to the above ... */
+ }
+ break;
+ case Iop_1Uto32:
+ case Iop_1Uto8: {
+ HReg r_dst = newVRegI(env);
+ PPCCondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_Set(cond,r_dst));
+ return r_dst;
+ }
+ case Iop_1Sto8:
+ case Iop_1Sto16:
+ case Iop_1Sto32: {
+ /* could do better than this, but for now ... */
+ HReg r_dst = newVRegI(env);
+ PPCCondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_Set(cond,r_dst));
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SHL, True/*32bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False,31)));
+ addInstr(env,
+ PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False,31)));
+ return r_dst;
+ }
+ case Iop_1Sto64:
+ if (mode64) {
+ /* could do better than this, but for now ... */
+ HReg r_dst = newVRegI(env);
+ PPCCondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_Set(cond,r_dst));
+ addInstr(env, PPCInstr_Shft(Pshft_SHL, False/*64bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False,63)));
+ addInstr(env, PPCInstr_Shft(Pshft_SAR, False/*64bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False,63)));
+ return r_dst;
+ }
+ break;
+ case Iop_Clz32:
+ case Iop_Clz64: {
+ HReg r_src, r_dst;
+ PPCUnaryOp op_clz = (op_unop == Iop_Clz32) ? Pun_CLZ32 :
+ Pun_CLZ64;
+ if (op_unop == Iop_Clz64 && !mode64)
+ goto irreducible;
+ /* Count leading zeroes. */
+ r_dst = newVRegI(env);
+ r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_Unary(op_clz,r_dst,r_src));
+ return r_dst;
+ }
+
+ case Iop_Left8:
+ case Iop_Left32:
+ case Iop_Left64: {
+ HReg r_src, r_dst;
+ if (op_unop == Iop_Left64 && !mode64)
+ goto irreducible;
+ r_dst = newVRegI(env);
+ r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
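+         /* Iop_Left computes x | -x, which sets every bit at and above
+            the lowest set bit of x. */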
+ addInstr(env, PPCInstr_Unary(Pun_NEG,r_dst,r_src));
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_dst, r_dst, PPCRH_Reg(r_src)));
+ return r_dst;
+ }
+
+ case Iop_CmpwNEZ32: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
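+         /* (x | -x) has its sign bit set iff x != 0, so the arithmetic
+            shift right by 31 yields all ones (x != 0) or all zeroes. */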
+ addInstr(env, PPCInstr_Unary(Pun_NEG,r_dst,r_src));
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_dst, r_dst, PPCRH_Reg(r_src)));
+ addInstr(env, PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False, 31)));
+ return r_dst;
+ }
+
+ case Iop_CmpwNEZ64: {
+ HReg r_dst = newVRegI(env);
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ if (!mode64) goto irreducible;
+ addInstr(env, PPCInstr_Unary(Pun_NEG,r_dst,r_src));
+ addInstr(env, PPCInstr_Alu(Palu_OR, r_dst, r_dst, PPCRH_Reg(r_src)));
+ addInstr(env, PPCInstr_Shft(Pshft_SAR, False/*64bit shift*/,
+ r_dst, r_dst, PPCRH_Imm(False, 63)));
+ return r_dst;
+ }
+
+ case Iop_V128to32: {
+ HReg r_aligned16;
+ HReg dst = newVRegI(env);
+ HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+ PPCAMode *am_off0, *am_off12;
+ sub_from_sp( env, 32 ); // Move SP down 32 bytes
+
+ // get a quadword aligned address within our stack space
+ r_aligned16 = get_sp_aligned16( env );
+ am_off0 = PPCAMode_IR( 0, r_aligned16 );
+ am_off12 = PPCAMode_IR( 12,r_aligned16 );
+
+ // store vec, load low word to dst
+ addInstr(env,
+ PPCInstr_AvLdSt( False/*store*/, 16, vec, am_off0 ));
+ addInstr(env,
+ PPCInstr_Load( 4, dst, am_off12, mode64 ));
+
+ add_to_sp( env, 32 ); // Reset SP
+ return dst;
+ }
+
+ case Iop_V128to64:
+ case Iop_V128HIto64:
+ if (mode64) {
+ HReg r_aligned16;
+ HReg dst = newVRegI(env);
+ HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+ PPCAMode *am_off0, *am_off8;
+ sub_from_sp( env, 32 ); // Move SP down 32 bytes
+
+ // get a quadword aligned address within our stack space
+ r_aligned16 = get_sp_aligned16( env );
+ am_off0 = PPCAMode_IR( 0, r_aligned16 );
+ am_off8 = PPCAMode_IR( 8 ,r_aligned16 );
+
+         // store vec, then load the low dword (+8) or high dword (+0) to dst
+ addInstr(env,
+ PPCInstr_AvLdSt( False/*store*/, 16, vec, am_off0 ));
+ addInstr(env,
+ PPCInstr_Load(
+ 8, dst,
+ op_unop == Iop_V128HIto64 ? am_off0 : am_off8,
+ mode64 ));
+
+ add_to_sp( env, 32 ); // Reset SP
+ return dst;
+ }
+ break;
+ case Iop_16to8:
+ case Iop_32to8:
+ case Iop_32to16:
+ case Iop_64to8:
+ /* These are no-ops. */
+ return iselWordExpr_R(env, e->Iex.Unop.arg);
+
+ /* ReinterpF64asI64(e) */
+ /* Given an IEEE754 double, produce an I64 with the same bit
+ pattern. */
+ case Iop_ReinterpF64asI64:
+ if (mode64) {
+ PPCAMode *am_addr;
+ HReg fr_src = iselDblExpr(env, e->Iex.Unop.arg);
+ HReg r_dst = newVRegI(env);
+
+ sub_from_sp( env, 16 ); // Move SP down 16 bytes
+ am_addr = PPCAMode_IR( 0, StackFramePtr(mode64) );
+
+ // store as F64
+ addInstr(env, PPCInstr_FpLdSt( False/*store*/, 8,
+ fr_src, am_addr ));
+ // load as Ity_I64
+ addInstr(env, PPCInstr_Load( 8, r_dst, am_addr, mode64 ));
+
+ add_to_sp( env, 16 ); // Reset SP
+ return r_dst;
+ }
+ break;
+
+ /* ReinterpF32asI32(e) */
+ /* Given an IEEE754 float, produce an I32 with the same bit
+ pattern. */
+ case Iop_ReinterpF32asI32: {
+ /* I believe this generates correct code for both 32- and
+ 64-bit hosts. */
+ PPCAMode *am_addr;
+ HReg fr_src = iselFltExpr(env, e->Iex.Unop.arg);
+ HReg r_dst = newVRegI(env);
+
+ sub_from_sp( env, 16 ); // Move SP down 16 bytes
+ am_addr = PPCAMode_IR( 0, StackFramePtr(mode64) );
+
+ // store as F32
+ addInstr(env, PPCInstr_FpLdSt( False/*store*/, 4,
+ fr_src, am_addr ));
+ // load as Ity_I32
+ addInstr(env, PPCInstr_Load( 4, r_dst, am_addr, mode64 ));
+
+ add_to_sp( env, 16 ); // Reset SP
+ return r_dst;
+ }
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ /* --------- GET --------- */
+ case Iex_Get: {
+ if (ty == Ity_I8 || ty == Ity_I16 ||
+ ty == Ity_I32 || ((ty == Ity_I64) && mode64)) {
+ HReg r_dst = newVRegI(env);
+ PPCAMode* am_addr = PPCAMode_IR( e->Iex.Get.offset,
+ GuestStatePtr(mode64) );
+ addInstr(env, PPCInstr_Load( toUChar(sizeofIRType(ty)),
+ r_dst, am_addr, mode64 ));
+ return r_dst;
+ }
+ break;
+ }
+
+ case Iex_GetI: {
+ PPCAMode* src_am
+ = genGuestArrayOffset( env, e->Iex.GetI.descr,
+ e->Iex.GetI.ix, e->Iex.GetI.bias );
+ HReg r_dst = newVRegI(env);
+ if (mode64 && ty == Ity_I64) {
+ addInstr(env, PPCInstr_Load( toUChar(8),
+ r_dst, src_am, mode64 ));
+ return r_dst;
+ }
+ if ((!mode64) && ty == Ity_I32) {
+ addInstr(env, PPCInstr_Load( toUChar(4),
+ r_dst, src_am, mode64 ));
+ return r_dst;
+ }
+ break;
+ }
+
+ /* --------- CCALL --------- */
+ case Iex_CCall: {
+ HReg r_dst = newVRegI(env);
+ vassert(ty == Ity_I32);
+
+ /* be very restrictive for now. Only 32/64-bit ints allowed
+ for args, and 32 bits for return type. */
+ if (e->Iex.CCall.retty != Ity_I32)
+ goto irreducible;
+
+ /* Marshal args, do the call, clear stack. */
+ doHelperCall( env, False, NULL,
+ e->Iex.CCall.cee, e->Iex.CCall.args );
+
+      /* The call's return value is in GPR3, per the PPC ABI. */
+ addInstr(env, mk_iMOVds_RR(r_dst, hregPPC_GPR3(mode64)));
+ return r_dst;
+ }
+
+ /* --------- LITERAL --------- */
+ /* 32/16/8-bit literals */
+ case Iex_Const: {
+ Long l;
+ HReg r_dst = newVRegI(env);
+ IRConst* con = e->Iex.Const.con;
+ switch (con->tag) {
+ case Ico_U64: if (!mode64) goto irreducible;
+ l = (Long) con->Ico.U64; break;
+ case Ico_U32: l = (Long)(Int) con->Ico.U32; break;
+ case Ico_U16: l = (Long)(Int)(Short)con->Ico.U16; break;
+ case Ico_U8: l = (Long)(Int)(Char )con->Ico.U8; break;
+ default: vpanic("iselIntExpr_R.const(ppc)");
+ }
+ addInstr(env, PPCInstr_LI(r_dst, (ULong)l, mode64));
+ return r_dst;
+ }
+
+ /* --------- MULTIPLEX --------- */
+ case Iex_Mux0X: {
+ if ((ty == Ity_I8 || ty == Ity_I16 ||
+ ty == Ity_I32 || ((ty == Ity_I64) && mode64)) &&
+ typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
+ PPCCondCode cc = mk_PPCCondCode( Pct_TRUE, Pcf_7EQ );
+ HReg r_cond = iselWordExpr_R(env, e->Iex.Mux0X.cond);
+ HReg rX = iselWordExpr_R(env, e->Iex.Mux0X.exprX);
+ PPCRI* r0 = iselWordExpr_RI(env, e->Iex.Mux0X.expr0);
+ HReg r_dst = newVRegI(env);
+ HReg r_tmp = newVRegI(env);
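+         /* Mux0X semantics: cond == 0 selects expr0, else exprX.  Start
+            with rX and conditionally overwrite with r0 on equality. */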
+ addInstr(env, mk_iMOVds_RR(r_dst,rX));
+ addInstr(env, PPCInstr_Alu(Palu_AND, r_tmp,
+ r_cond, PPCRH_Imm(False,0xFF)));
+ addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+ 7/*cr*/, r_tmp, PPCRH_Imm(False,0)));
+ addInstr(env, PPCInstr_CMov(cc,r_dst,r0));
+ return r_dst;
+ }
+ break;
+ }
+
+ default:
+ break;
+ } /* switch (e->tag) */
+
+
+ /* We get here if no pattern matched. */
+ irreducible:
+ ppIRExpr(e);
+ vpanic("iselIntExpr_R(ppc): cannot reduce tree");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expression auxiliaries ---*/
+/*---------------------------------------------------------*/
+
+/* --------------------- AMODEs --------------------- */
+
+/* Return an AMode which computes the value of the specified
+ expression, possibly also adding insns to the code list as a
+ result. The expression may only be a word-size one.
+*/
+
+static Bool uInt_fits_in_16_bits ( UInt u )
+{
+ /* Is u the same as the sign-extend of its lower 16 bits? */
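+   /* e.g. 0xFFFF8000 fits (it is the sign-extension of 0x8000), but
+      0x00008000 does not, since 0x8000 sign-extends to 0xFFFF8000. */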
+ Int i = u & 0xFFFF;
+ i <<= 16;
+ i >>= 16;
+ return toBool(u == (UInt)i);
+}
+
+static Bool uLong_fits_in_16_bits ( ULong u )
+{
+ /* Is u the same as the sign-extend of its lower 16 bits? */
+ Long i = u & 0xFFFFULL;
+ i <<= 48;
+ i >>= 48;
+ return toBool(u == (ULong)i);
+}
+
+static Bool uLong_is_4_aligned ( ULong u )
+{
+ return toBool((u & 3ULL) == 0);
+}
+
+static Bool sane_AMode ( ISelEnv* env, PPCAMode* am )
+{
+ Bool mode64 = env->mode64;
+ switch (am->tag) {
+ case Pam_IR:
+ /* Using uInt_fits_in_16_bits in 64-bit mode seems a bit bogus,
+ somehow, but I think it's OK. */
+ return toBool( hregClass(am->Pam.IR.base) == HRcGPR(mode64) &&
+ hregIsVirtual(am->Pam.IR.base) &&
+ uInt_fits_in_16_bits(am->Pam.IR.index) );
+ case Pam_RR:
+ return toBool( hregClass(am->Pam.RR.base) == HRcGPR(mode64) &&
+ hregIsVirtual(am->Pam.RR.base) &&
+ hregClass(am->Pam.RR.index) == HRcGPR(mode64) &&
+                     hregIsVirtual(am->Pam.RR.index) );
+ default:
+ vpanic("sane_AMode: unknown ppc amode tag");
+ }
+}
+
+static
+PPCAMode* iselWordExpr_AMode ( ISelEnv* env, IRExpr* e, IRType xferTy )
+{
+ PPCAMode* am = iselWordExpr_AMode_wrk(env, e, xferTy);
+ vassert(sane_AMode(env, am));
+ return am;
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static PPCAMode* iselWordExpr_AMode_wrk ( ISelEnv* env, IRExpr* e, IRType xferTy )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+
+ if (env->mode64) {
+
+ /* If the data load/store type is I32 or I64, this amode might
+ be destined for use in ld/ldu/lwa/st/stu. In which case
+ insist that if it comes out as an _IR, the immediate must
+ have its bottom two bits be zero. This does assume that for
+ any other type (I8/I16/I128/F32/F64/V128) the amode will not
+ be parked in any such instruction. But that seems a
+ reasonable assumption. */
+ Bool aligned4imm = toBool(xferTy == Ity_I32 || xferTy == Ity_I64);
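+      /* (ld/ldu/lwa/std/stdu are DS-form instructions: the bottom two
+         bits of the displacement are not encoded and must be zero.) */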
+
+ vassert(ty == Ity_I64);
+
+ /* Add64(expr,i), where i == sign-extend of (i & 0xFFFF) */
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_Add64
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64
+ && (aligned4imm ? uLong_is_4_aligned(e->Iex.Binop.arg2
+ ->Iex.Const.con->Ico.U64)
+ : True)
+ && uLong_fits_in_16_bits(e->Iex.Binop.arg2
+ ->Iex.Const.con->Ico.U64)) {
+ return PPCAMode_IR( (Int)e->Iex.Binop.arg2->Iex.Const.con->Ico.U64,
+ iselWordExpr_R(env, e->Iex.Binop.arg1) );
+ }
+
+ /* Add64(expr,expr) */
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_Add64) {
+ HReg r_base = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r_idx = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ return PPCAMode_RR( r_idx, r_base );
+ }
+
+ } else {
+
+ vassert(ty == Ity_I32);
+
+ /* Add32(expr,i), where i == sign-extend of (i & 0xFFFF) */
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_Add32
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
+ && uInt_fits_in_16_bits(e->Iex.Binop.arg2
+ ->Iex.Const.con->Ico.U32)) {
+ return PPCAMode_IR( (Int)e->Iex.Binop.arg2->Iex.Const.con->Ico.U32,
+ iselWordExpr_R(env, e->Iex.Binop.arg1) );
+ }
+
+ /* Add32(expr,expr) */
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_Add32) {
+ HReg r_base = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r_idx = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ return PPCAMode_RR( r_idx, r_base );
+ }
+
+ }
+
+ /* Doesn't match anything in particular. Generate it into
+ a register and use that. */
+ return PPCAMode_IR( 0, iselWordExpr_R(env,e) );
+}
+
+
+/* --------------------- RH --------------------- */
+
+/* Compute an I8/I16/I32 (and I64, in 64-bit mode) into a RH
+ (reg-or-halfword-immediate). It's important to specify whether the
+ immediate is to be regarded as signed or not. If yes, this will
+   never return -32768 as an immediate; this guarantees that all
+   signed immediates that are returned can have their sign inverted if
+ need be. */
+
+static PPCRH* iselWordExpr_RH ( ISelEnv* env, Bool syned, IRExpr* e )
+{
+ PPCRH* ri = iselWordExpr_RH_wrk(env, syned, e);
+ /* sanity checks ... */
+ switch (ri->tag) {
+ case Prh_Imm:
+ vassert(ri->Prh.Imm.syned == syned);
+ if (syned)
+ vassert(ri->Prh.Imm.imm16 != 0x8000);
+ return ri;
+ case Prh_Reg:
+ vassert(hregClass(ri->Prh.Reg.reg) == HRcGPR(env->mode64));
+ vassert(hregIsVirtual(ri->Prh.Reg.reg));
+ return ri;
+ default:
+ vpanic("iselIntExpr_RH: unknown ppc RH tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static PPCRH* iselWordExpr_RH_wrk ( ISelEnv* env, Bool syned, IRExpr* e )
+{
+ ULong u;
+ Long l;
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I8 || ty == Ity_I16 ||
+ ty == Ity_I32 || ((ty == Ity_I64) && env->mode64));
+
+ /* special case: immediate */
+ if (e->tag == Iex_Const) {
+ IRConst* con = e->Iex.Const.con;
+ /* What value are we aiming to generate? */
+ switch (con->tag) {
+ /* Note: Not sign-extending - we carry 'syned' around */
+ case Ico_U64: vassert(env->mode64);
+ u = con->Ico.U64; break;
+ case Ico_U32: u = 0xFFFFFFFF & con->Ico.U32; break;
+ case Ico_U16: u = 0x0000FFFF & con->Ico.U16; break;
+ case Ico_U8: u = 0x000000FF & con->Ico.U8; break;
+ default: vpanic("iselIntExpr_RH.Iex_Const(ppch)");
+ }
+ l = (Long)u;
+ /* Now figure out if it's representable. */
+ if (!syned && u <= 65535) {
+ return PPCRH_Imm(False/*unsigned*/, toUShort(u & 0xFFFF));
+ }
+ if (syned && l >= -32767 && l <= 32767) {
+ return PPCRH_Imm(True/*signed*/, toUShort(u & 0xFFFF));
+ }
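+      /* Note -32768 is deliberately excluded from the signed case: its
+         negation is not representable in 16 bits, which would break the
+         sign-invert guarantee described above. */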
+ /* no luck; use the Slow Way. */
+ }
+
+ /* default case: calculate into a register and return that */
+ return PPCRH_Reg( iselWordExpr_R ( env, e ) );
+}
+
+
+/* --------------------- RIs --------------------- */
+
+/* Calculate an expression into a PPCRI operand.  As with
+ iselIntExpr_R, the expression can have type 32, 16 or 8 bits, or,
+ in 64-bit mode, 64 bits. */
+
+static PPCRI* iselWordExpr_RI ( ISelEnv* env, IRExpr* e )
+{
+ PPCRI* ri = iselWordExpr_RI_wrk(env, e);
+ /* sanity checks ... */
+ switch (ri->tag) {
+ case Pri_Imm:
+ return ri;
+ case Pri_Reg:
+ vassert(hregClass(ri->Pri.Reg) == HRcGPR(env->mode64));
+ vassert(hregIsVirtual(ri->Pri.Reg));
+ return ri;
+ default:
+ vpanic("iselIntExpr_RI: unknown ppc RI tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static PPCRI* iselWordExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
+{
+ Long l;
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I8 || ty == Ity_I16 ||
+ ty == Ity_I32 || ((ty == Ity_I64) && env->mode64));
+
+ /* special case: immediate */
+ if (e->tag == Iex_Const) {
+ IRConst* con = e->Iex.Const.con;
+ switch (con->tag) {
+ case Ico_U64: vassert(env->mode64);
+ l = (Long) con->Ico.U64; break;
+ case Ico_U32: l = (Long)(Int) con->Ico.U32; break;
+ case Ico_U16: l = (Long)(Int)(Short)con->Ico.U16; break;
+ case Ico_U8: l = (Long)(Int)(Char )con->Ico.U8; break;
+ default: vpanic("iselIntExpr_RI.Iex_Const(ppch)");
+ }
+ return PPCRI_Imm((ULong)l);
+ }
+
+ /* default case: calculate into a register and return that */
+ return PPCRI_Reg( iselWordExpr_R ( env, e ) );
+}
+
+
+/* --------------------- RH5u --------------------- */
+
+/* Compute an I8 into a reg-or-5-bit-unsigned-immediate, the latter
+ being an immediate in the range 1 .. 31 inclusive. Used for doing
+ shift amounts. Only used in 32-bit mode. */
+
+static PPCRH* iselWordExpr_RH5u ( ISelEnv* env, IRExpr* e )
+{
+ PPCRH* ri;
+ vassert(!env->mode64);
+ ri = iselWordExpr_RH5u_wrk(env, e);
+ /* sanity checks ... */
+ switch (ri->tag) {
+ case Prh_Imm:
+ vassert(ri->Prh.Imm.imm16 >= 1 && ri->Prh.Imm.imm16 <= 31);
+ vassert(!ri->Prh.Imm.syned);
+ return ri;
+ case Prh_Reg:
+ vassert(hregClass(ri->Prh.Reg.reg) == HRcGPR(env->mode64));
+ vassert(hregIsVirtual(ri->Prh.Reg.reg));
+ return ri;
+ default:
+         vpanic("iselIntExpr_RH5u: unknown ppc RH tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static PPCRH* iselWordExpr_RH5u_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I8);
+
+ /* special case: immediate */
+ if (e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U8
+ && e->Iex.Const.con->Ico.U8 >= 1
+ && e->Iex.Const.con->Ico.U8 <= 31) {
+ return PPCRH_Imm(False/*unsigned*/, e->Iex.Const.con->Ico.U8);
+ }
+
+ /* default case: calculate into a register and return that */
+ return PPCRH_Reg( iselWordExpr_R ( env, e ) );
+}
+
+
+/* --------------------- RH6u --------------------- */
+
+/* Compute an I8 into a reg-or-6-bit-unsigned-immediate, the latter
+ being an immediate in the range 1 .. 63 inclusive. Used for doing
+ shift amounts. Only used in 64-bit mode. */
+
+static PPCRH* iselWordExpr_RH6u ( ISelEnv* env, IRExpr* e )
+{
+ PPCRH* ri;
+ vassert(env->mode64);
+ ri = iselWordExpr_RH6u_wrk(env, e);
+ /* sanity checks ... */
+ switch (ri->tag) {
+ case Prh_Imm:
+ vassert(ri->Prh.Imm.imm16 >= 1 && ri->Prh.Imm.imm16 <= 63);
+ vassert(!ri->Prh.Imm.syned);
+ return ri;
+ case Prh_Reg:
+ vassert(hregClass(ri->Prh.Reg.reg) == HRcGPR(env->mode64));
+ vassert(hregIsVirtual(ri->Prh.Reg.reg));
+ return ri;
+ default:
+         vpanic("iselIntExpr_RH6u: unknown ppc64 RH tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static PPCRH* iselWordExpr_RH6u_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I8);
+
+ /* special case: immediate */
+ if (e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U8
+ && e->Iex.Const.con->Ico.U8 >= 1
+ && e->Iex.Const.con->Ico.U8 <= 63) {
+ return PPCRH_Imm(False/*unsigned*/, e->Iex.Const.con->Ico.U8);
+ }
+
+ /* default case: calculate into a register and return that */
+ return PPCRH_Reg( iselWordExpr_R ( env, e ) );
+}
+
+
+/* --------------------- CONDCODE --------------------- */
+
+/* Generate code to evaluate a bit-typed expression, returning the
+   condition code which corresponds to the expression notionally
+   evaluating to 1. */
+
+static PPCCondCode iselCondCode ( ISelEnv* env, IRExpr* e )
+{
+ /* Uh, there's nothing we can sanity check here, unfortunately. */
+ return iselCondCode_wrk(env,e);
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static PPCCondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
+{
+ vassert(e);
+ vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
+
+ /* Constant 1:Bit */
+ if (e->tag == Iex_Const && e->Iex.Const.con->Ico.U1 == True) {
+ // Make a compare that will always be true:
+ HReg r_zero = newVRegI(env);
+ addInstr(env, PPCInstr_LI(r_zero, 0, env->mode64));
+ addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+ 7/*cr*/, r_zero, PPCRH_Reg(r_zero)));
+ return mk_PPCCondCode( Pct_TRUE, Pcf_7EQ );
+ }
+
+ /* Not1(...) */
+ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
+ /* Generate code for the arg, and negate the test condition */
+ PPCCondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ cond.test = invertCondTest(cond.test);
+ return cond;
+ }
+
+ /* --- patterns rooted at: 32to1 or 64to1 --- */
+
+ /* 32to1, 64to1 */
+ if (e->tag == Iex_Unop &&
+ (e->Iex.Unop.op == Iop_32to1 || e->Iex.Unop.op == Iop_64to1)) {
+ HReg src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ HReg tmp = newVRegI(env);
+ /* could do better, probably -- andi. */
+ addInstr(env, PPCInstr_Alu(Palu_AND, tmp,
+ src, PPCRH_Imm(False,1)));
+ addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+ 7/*cr*/, tmp, PPCRH_Imm(False,1)));
+ return mk_PPCCondCode( Pct_TRUE, Pcf_7EQ );
+ }
+
+ /* --- patterns rooted at: CmpNEZ8 --- */
+
+ /* CmpNEZ8(x) */
+ /* could do better -- andi. */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ8) {
+ HReg arg = iselWordExpr_R(env, e->Iex.Unop.arg);
+ HReg tmp = newVRegI(env);
+ addInstr(env, PPCInstr_Alu(Palu_AND, tmp, arg,
+ PPCRH_Imm(False,0xFF)));
+ addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+ 7/*cr*/, tmp, PPCRH_Imm(False,0)));
+ return mk_PPCCondCode( Pct_FALSE, Pcf_7EQ );
+ }
+
+ /* --- patterns rooted at: CmpNEZ32 --- */
+
+ /* CmpNEZ32(x) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ32) {
+ HReg r1 = iselWordExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+ 7/*cr*/, r1, PPCRH_Imm(False,0)));
+ return mk_PPCCondCode( Pct_FALSE, Pcf_7EQ );
+ }
+
+ /* --- patterns rooted at: Cmp*32* --- */
+
+ /* Cmp*32*(x,y) */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ32
+ || e->Iex.Binop.op == Iop_CmpNE32
+ || e->Iex.Binop.op == Iop_CmpLT32S
+ || e->Iex.Binop.op == Iop_CmpLT32U
+ || e->Iex.Binop.op == Iop_CmpLE32S
+ || e->Iex.Binop.op == Iop_CmpLE32U)) {
+ Bool syned = (e->Iex.Binop.op == Iop_CmpLT32S ||
+ e->Iex.Binop.op == Iop_CmpLE32S);
+ HReg r1 = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ PPCRH* ri2 = iselWordExpr_RH(env, syned, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_Cmp(syned, True/*32bit cmp*/,
+ 7/*cr*/, r1, ri2));
+
+      /* The compare above already honours signedness, so the S and U
+         variants share condition bits; LE is encoded as "not GT". */
+      switch (e->Iex.Binop.op) {
+      case Iop_CmpEQ32:  return mk_PPCCondCode( Pct_TRUE,  Pcf_7EQ );
+      case Iop_CmpNE32:  return mk_PPCCondCode( Pct_FALSE, Pcf_7EQ );
+      case Iop_CmpLT32U: case Iop_CmpLT32S:
+                         return mk_PPCCondCode( Pct_TRUE,  Pcf_7LT );
+      case Iop_CmpLE32U: case Iop_CmpLE32S:
+                         return mk_PPCCondCode( Pct_FALSE, Pcf_7GT );
+      default: vpanic("iselCondCode(ppc): CmpXX32");
+      }
+ }
+
+ /* --- patterns rooted at: CmpNEZ64 --- */
+
+ /* CmpNEZ64 */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ64) {
+ if (!env->mode64) {
+ HReg hi, lo;
+ HReg tmp = newVRegI(env);
+ iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
+ addInstr(env, PPCInstr_Alu(Palu_OR, tmp, lo, PPCRH_Reg(hi)));
+ addInstr(env, PPCInstr_Cmp(False/*sign*/, True/*32bit cmp*/,
+ 7/*cr*/, tmp,PPCRH_Imm(False,0)));
+ return mk_PPCCondCode( Pct_FALSE, Pcf_7EQ );
+ } else { // mode64
+         HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_Cmp(False/*sign*/, False/*64bit cmp*/,
+ 7/*cr*/, r_src,PPCRH_Imm(False,0)));
+ return mk_PPCCondCode( Pct_FALSE, Pcf_7EQ );
+ }
+ }
+
+ /* --- patterns rooted at: Cmp*64* --- */
+
+ /* Cmp*64*(x,y) */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ64
+ || e->Iex.Binop.op == Iop_CmpNE64
+ || e->Iex.Binop.op == Iop_CmpLT64S
+ || e->Iex.Binop.op == Iop_CmpLT64U
+ || e->Iex.Binop.op == Iop_CmpLE64S
+ || e->Iex.Binop.op == Iop_CmpLE64U)) {
+ Bool syned = (e->Iex.Binop.op == Iop_CmpLT64S ||
+ e->Iex.Binop.op == Iop_CmpLE64S);
+ HReg r1 = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ PPCRH* ri2 = iselWordExpr_RH(env, syned, e->Iex.Binop.arg2);
+ vassert(env->mode64);
+ addInstr(env, PPCInstr_Cmp(syned, False/*64bit cmp*/,
+ 7/*cr*/, r1, ri2));
+
+      /* As in the 32-bit case, the compare already honours signedness,
+         so the S and U variants share condition bits. */
+      switch (e->Iex.Binop.op) {
+      case Iop_CmpEQ64:  return mk_PPCCondCode( Pct_TRUE,  Pcf_7EQ );
+      case Iop_CmpNE64:  return mk_PPCCondCode( Pct_FALSE, Pcf_7EQ );
+      case Iop_CmpLT64U: case Iop_CmpLT64S:
+                         return mk_PPCCondCode( Pct_TRUE,  Pcf_7LT );
+      case Iop_CmpLE64U: case Iop_CmpLE64S:
+                         return mk_PPCCondCode( Pct_FALSE, Pcf_7GT );
+      default: vpanic("iselCondCode(ppc): CmpXX64");
+      }
+ }
+
+ /* var */
+ if (e->tag == Iex_RdTmp) {
+ HReg r_src = lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ HReg src_masked = newVRegI(env);
+ addInstr(env,
+ PPCInstr_Alu(Palu_AND, src_masked,
+ r_src, PPCRH_Imm(False,1)));
+ addInstr(env,
+ PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+ 7/*cr*/, src_masked, PPCRH_Imm(False,1)));
+ return mk_PPCCondCode( Pct_TRUE, Pcf_7EQ );
+ }
+
+ vex_printf("iselCondCode(ppc): No such tag(%u)\n", e->tag);
+ ppIRExpr(e);
+ vpanic("iselCondCode(ppc)");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expressions (128 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* 64-bit mode ONLY: compute a 128-bit value into a register pair,
+ which is returned as the first two parameters. As with
+ iselWordExpr_R, these may be either real or virtual regs; in any
+ case they must not be changed by subsequent code emitted by the
+ caller. */
+
+static void iselInt128Expr ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e )
+{
+ vassert(env->mode64);
+ iselInt128Expr_wrk(rHi, rLo, env, e);
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(*rHi) == HRcGPR(env->mode64));
+ vassert(hregIsVirtual(*rHi));
+ vassert(hregClass(*rLo) == HRcGPR(env->mode64));
+ vassert(hregIsVirtual(*rLo));
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static void iselInt128Expr_wrk ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e )
+{
+ vassert(e);
+ vassert(typeOfIRExpr(env->type_env,e) == Ity_I128);
+
+ /* read 128-bit IRTemp */
+ if (e->tag == Iex_RdTmp) {
+ lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
+ return;
+ }
+
+ /* --------- BINARY ops --------- */
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+ /* 64 x 64 -> 128 multiply */
+ case Iop_MullU64:
+ case Iop_MullS64: {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ Bool syned = toBool(e->Iex.Binop.op == Iop_MullS64);
+ HReg r_srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r_srcR = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_MulL(False/*signedness irrelevant*/,
+ False/*lo64*/, False/*64bit mul*/,
+ tLo, r_srcL, r_srcR));
+ addInstr(env, PPCInstr_MulL(syned,
+ True/*hi64*/, False/*64bit mul*/,
+ tHi, r_srcL, r_srcR));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64HLto128(e1,e2) */
+ case Iop_64HLto128:
+ *rHi = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ *rLo = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ return;
+
+ default:
+ break;
+ }
+ } /* if (e->tag == Iex_Binop) */
+
+
+ /* --------- UNARY ops --------- */
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+ default:
+ break;
+ }
+ } /* if (e->tag == Iex_Unop) */
+
+ vex_printf("iselInt128Expr(ppc64): No such tag(%u)\n", e->tag);
+ ppIRExpr(e);
+ vpanic("iselInt128Expr(ppc64)");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expressions (64 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* 32-bit mode ONLY: compute a 64-bit value into a register pair,
+ which is returned as the first two parameters. As with
+ iselIntExpr_R, these may be either real or virtual regs; in any
+ case they must not be changed by subsequent code emitted by the
+ caller. */
+
+static void iselInt64Expr ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e )
+{
+ vassert(!env->mode64);
+ iselInt64Expr_wrk(rHi, rLo, env, e);
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(*rHi) == HRcInt32);
+ vassert(hregIsVirtual(*rHi));
+ vassert(hregClass(*rLo) == HRcInt32);
+ vassert(hregIsVirtual(*rLo));
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e )
+{
+ vassert(e);
+ vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
+
+ /* 64-bit load */
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_BE) {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg r_addr = iselWordExpr_R(env, e->Iex.Load.addr);
+ vassert(!env->mode64);
+      addInstr(env, PPCInstr_Load( 4/*4-byte load*/,
+ tHi, PPCAMode_IR( 0, r_addr ),
+ False/*32-bit insn please*/) );
+      addInstr(env, PPCInstr_Load( 4/*4-byte load*/,
+ tLo, PPCAMode_IR( 4, r_addr ),
+ False/*32-bit insn please*/) );
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64-bit literal */
+ if (e->tag == Iex_Const) {
+ ULong w64 = e->Iex.Const.con->Ico.U64;
+ UInt wHi = ((UInt)(w64 >> 32)) & 0xFFFFFFFF;
+ UInt wLo = ((UInt)w64) & 0xFFFFFFFF;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ vassert(e->Iex.Const.con->tag == Ico_U64);
+ addInstr(env, PPCInstr_LI(tHi, (Long)(Int)wHi, False/*mode32*/));
+ addInstr(env, PPCInstr_LI(tLo, (Long)(Int)wLo, False/*mode32*/));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* read 64-bit IRTemp */
+ if (e->tag == Iex_RdTmp) {
+ lookupIRTempPair( rHi, rLo, env, e->Iex.RdTmp.tmp);
+ return;
+ }
+
+ /* 64-bit GET */
+ if (e->tag == Iex_Get) {
+ PPCAMode* am_addr = PPCAMode_IR( e->Iex.Get.offset,
+ GuestStatePtr(False/*mode32*/) );
+ PPCAMode* am_addr4 = advance4(env, am_addr);
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ addInstr(env, PPCInstr_Load( 4, tHi, am_addr, False/*mode32*/ ));
+ addInstr(env, PPCInstr_Load( 4, tLo, am_addr4, False/*mode32*/ ));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64-bit Mux0X */
+ if (e->tag == Iex_Mux0X) {
+ HReg e0Lo, e0Hi, eXLo, eXHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+
+ PPCCondCode cc = mk_PPCCondCode( Pct_TRUE, Pcf_7EQ );
+ HReg r_cond = iselWordExpr_R(env, e->Iex.Mux0X.cond);
+ HReg r_tmp = newVRegI(env);
+
+ iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
+ iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
+ addInstr(env, mk_iMOVds_RR(tHi,eXHi));
+ addInstr(env, mk_iMOVds_RR(tLo,eXLo));
+
+ addInstr(env, PPCInstr_Alu(Palu_AND,
+ r_tmp, r_cond, PPCRH_Imm(False,0xFF)));
+ addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+ 7/*cr*/, r_tmp, PPCRH_Imm(False,0)));
+
+ addInstr(env, PPCInstr_CMov(cc,tHi,PPCRI_Reg(e0Hi)));
+ addInstr(env, PPCInstr_CMov(cc,tLo,PPCRI_Reg(e0Lo)));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* --------- BINARY ops --------- */
+ if (e->tag == Iex_Binop) {
+ IROp op_binop = e->Iex.Binop.op;
+ switch (op_binop) {
+ /* 32 x 32 -> 64 multiply */
+ case Iop_MullU32:
+ case Iop_MullS32: {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ Bool syned = toBool(op_binop == Iop_MullS32);
+ HReg r_srcL = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg r_srcR = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_MulL(False/*signedness irrelevant*/,
+ False/*lo32*/, True/*32bit mul*/,
+ tLo, r_srcL, r_srcR));
+ addInstr(env, PPCInstr_MulL(syned,
+ True/*hi32*/, True/*32bit mul*/,
+ tHi, r_srcL, r_srcR));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* Or64/And64/Xor64 */
+ case Iop_Or64:
+ case Iop_And64:
+ case Iop_Xor64: {
+ HReg xLo, xHi, yLo, yHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ PPCAluOp op = (op_binop == Iop_Or64) ? Palu_OR :
+ (op_binop == Iop_And64) ? Palu_AND : Palu_XOR;
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_Alu(op, tHi, xHi, PPCRH_Reg(yHi)));
+ addInstr(env, PPCInstr_Alu(op, tLo, xLo, PPCRH_Reg(yLo)));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* Add64 */
+ case Iop_Add64: {
+ HReg xLo, xHi, yLo, yHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
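+         /* Add the low halves first, capturing the carry (addc), then
+            fold that carry into the high-half addition (adde). */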
+ addInstr(env, PPCInstr_AddSubC( True/*add*/, True /*set carry*/,
+ tLo, xLo, yLo));
+ addInstr(env, PPCInstr_AddSubC( True/*add*/, False/*read carry*/,
+ tHi, xHi, yHi));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 32HLto64(e1,e2) */
+ case Iop_32HLto64:
+ *rHi = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ *rLo = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ return;
+
+ /* F64toI64S */
+ case Iop_F64toI64S: {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg r1 = StackFramePtr(env->mode64);
+ PPCAMode* zero_r1 = PPCAMode_IR( 0, r1 );
+ PPCAMode* four_r1 = PPCAMode_IR( 4, r1 );
+ HReg fsrc = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg ftmp = newVRegF(env);
+
+ vassert(!env->mode64);
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
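+         /* Convert F64 -> I64 within an FP register, park the 64-bit
+            result on the stack, and reload it as two 32-bit integer
+            halves. */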
+ sub_from_sp( env, 16 );
+ addInstr(env, PPCInstr_FpCftI(False/*F->I*/, False/*int64*/,
+ ftmp, fsrc));
+ addInstr(env, PPCInstr_FpLdSt(False/*store*/, 8, ftmp, zero_r1));
+ addInstr(env, PPCInstr_Load(4, tHi, zero_r1, False/*mode32*/));
+ addInstr(env, PPCInstr_Load(4, tLo, four_r1, False/*mode32*/));
+ add_to_sp( env, 16 );
+
+ ///* Restore default FPU rounding. */
+ //set_FPU_rounding_default( env );
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ default:
+ break;
+ }
+ } /* if (e->tag == Iex_Binop) */
+
+
+ /* --------- UNARY ops --------- */
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+
+ /* CmpwNEZ64(e) */
+ case Iop_CmpwNEZ64: {
+ HReg argHi, argLo;
+ HReg tmp1 = newVRegI(env);
+ HReg tmp2 = newVRegI(env);
+ iselInt64Expr(&argHi, &argLo, env, e->Iex.Unop.arg);
+ /* tmp1 = argHi | argLo */
+ addInstr(env, PPCInstr_Alu(Palu_OR, tmp1, argHi, PPCRH_Reg(argLo)));
+ /* tmp2 = (tmp1 | -tmp1) >>s 31 */
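+         /* If tmp1 is nonzero, at least one of tmp1 and -tmp1 has its
+            sign bit set, so the arithmetic shift yields all ones; if
+            tmp1 is zero, both terms are zero and the shift yields
+            zero.  Either way the two halves of the result are
+            identical, so tmp2 serves for both. */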
+ addInstr(env, PPCInstr_Unary(Pun_NEG,tmp2,tmp1));
+ addInstr(env, PPCInstr_Alu(Palu_OR, tmp2, tmp2, PPCRH_Reg(tmp1)));
+ addInstr(env, PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ tmp2, tmp2, PPCRH_Imm(False, 31)));
+ *rHi = tmp2;
+ *rLo = tmp2; /* yes, really tmp2 */
+ return;
+ }
+
+ /* Left64 */
+ case Iop_Left64: {
+ HReg argHi, argLo;
+ HReg zero32 = newVRegI(env);
+ HReg resHi = newVRegI(env);
+ HReg resLo = newVRegI(env);
+ iselInt64Expr(&argHi, &argLo, env, e->Iex.Unop.arg);
+ vassert(env->mode64 == False);
+ addInstr(env, PPCInstr_LI(zero32, 0, env->mode64));
+ /* resHi:resLo = - argHi:argLo */
+ addInstr(env, PPCInstr_AddSubC( False/*sub*/, True/*set carry*/,
+ resLo, zero32, argLo ));
+ addInstr(env, PPCInstr_AddSubC( False/*sub*/, False/*read carry*/,
+ resHi, zero32, argHi ));
+ /* resHi:resLo |= srcHi:srcLo */
+ addInstr(env, PPCInstr_Alu(Palu_OR, resLo, resLo, PPCRH_Reg(argLo)));
+ addInstr(env, PPCInstr_Alu(Palu_OR, resHi, resHi, PPCRH_Reg(argHi)));
+ *rHi = resHi;
+ *rLo = resLo;
+ return;
+ }
+
+ /* 32Sto64(e) */
+ case Iop_32Sto64: {
+ HReg tHi = newVRegI(env);
+ HReg src = iselWordExpr_R(env, e->Iex.Unop.arg);
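+         /* High half = src >>s 31, i.e. 32 copies of src's sign bit. */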
+ addInstr(env, PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ tHi, src, PPCRH_Imm(False,31)));
+ *rHi = tHi;
+ *rLo = src;
+ return;
+ }
+
+ /* 32Uto64(e) */
+ case Iop_32Uto64: {
+ HReg tHi = newVRegI(env);
+ HReg tLo = iselWordExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_LI(tHi, 0, False/*mode32*/));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* V128{HI}to64 */
+ case Iop_V128HIto64:
+ case Iop_V128to64: {
+ HReg r_aligned16;
+ Int off = e->Iex.Unop.op==Iop_V128HIto64 ? 0 : 8;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+ PPCAMode *am_off0, *am_offLO, *am_offHI;
+ sub_from_sp( env, 32 ); // Move SP down 32 bytes
+
+ // get a quadword aligned address within our stack space
+ r_aligned16 = get_sp_aligned16( env );
+ am_off0 = PPCAMode_IR( 0, r_aligned16 );
+ am_offHI = PPCAMode_IR( off, r_aligned16 );
+ am_offLO = PPCAMode_IR( off+4, r_aligned16 );
+
+ // store as Vec128
+ addInstr(env,
+ PPCInstr_AvLdSt( False/*store*/, 16, vec, am_off0 ));
+
+ // load hi,lo words (of hi/lo half of vec) as Ity_I32's
+ addInstr(env,
+ PPCInstr_Load( 4, tHi, am_offHI, False/*mode32*/ ));
+ addInstr(env,
+ PPCInstr_Load( 4, tLo, am_offLO, False/*mode32*/ ));
+
+ add_to_sp( env, 32 ); // Reset SP
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* could do better than this, but for now ... */
+ case Iop_1Sto64: {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ PPCCondCode cond = iselCondCode(env, e->Iex.Unop.arg);
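+         /* Materialise the condition as 0 or 1 in tLo, replicate bit 0
+            across the word via shl 31 / sar 31, and copy the result to
+            the high half. */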
+ addInstr(env, PPCInstr_Set(cond,tLo));
+ addInstr(env, PPCInstr_Shft(Pshft_SHL, True/*32bit shift*/,
+ tLo, tLo, PPCRH_Imm(False,31)));
+ addInstr(env, PPCInstr_Shft(Pshft_SAR, True/*32bit shift*/,
+ tLo, tLo, PPCRH_Imm(False,31)));
+ addInstr(env, mk_iMOVds_RR(tHi, tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* ReinterpF64asI64(e) */
+ /* Given an IEEE754 double, produce an I64 with the same bit
+ pattern. */
+ case Iop_ReinterpF64asI64: {
+ PPCAMode *am_addr0, *am_addr1;
+ HReg fr_src = iselDblExpr(env, e->Iex.Unop.arg);
+ HReg r_dstLo = newVRegI(env);
+ HReg r_dstHi = newVRegI(env);
+
+ sub_from_sp( env, 16 ); // Move SP down 16 bytes
+ am_addr0 = PPCAMode_IR( 0, StackFramePtr(False/*mode32*/) );
+ am_addr1 = PPCAMode_IR( 4, StackFramePtr(False/*mode32*/) );
+
+ // store as F64
+ addInstr(env, PPCInstr_FpLdSt( False/*store*/, 8,
+ fr_src, am_addr0 ));
+
+ // load hi,lo as Ity_I32's
+ addInstr(env, PPCInstr_Load( 4, r_dstHi,
+ am_addr0, False/*mode32*/ ));
+ addInstr(env, PPCInstr_Load( 4, r_dstLo,
+ am_addr1, False/*mode32*/ ));
+ *rHi = r_dstHi;
+ *rLo = r_dstLo;
+
+ add_to_sp( env, 16 ); // Reset SP
+ return;
+ }
+
+ default:
+ break;
+ }
+ } /* if (e->tag == Iex_Unop) */
+
+ vex_printf("iselInt64Expr(ppc): No such tag(%u)\n", e->tag);
+ ppIRExpr(e);
+ vpanic("iselInt64Expr(ppc)");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Floating point expressions (32 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Nothing interesting here; really just wrappers for
+ 64-bit stuff. */
+
+static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselFltExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_F32);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_BE) {
+ PPCAMode* am_addr;
+ HReg r_dst = newVRegF(env);
+ vassert(e->Iex.Load.ty == Ity_F32);
+ am_addr = iselWordExpr_AMode(env, e->Iex.Load.addr, Ity_F32/*xfer*/);
+ addInstr(env, PPCInstr_FpLdSt(True/*load*/, 4, r_dst, am_addr));
+ return r_dst;
+ }
+
+ if (e->tag == Iex_Get) {
+ HReg r_dst = newVRegF(env);
+ PPCAMode* am_addr = PPCAMode_IR( e->Iex.Get.offset,
+ GuestStatePtr(env->mode64) );
+ addInstr(env, PPCInstr_FpLdSt( True/*load*/, 4, r_dst, am_addr ));
+ return r_dst;
+ }
+
+ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_TruncF64asF32) {
+ /* This is quite subtle. The only way to do the relevant
+ truncation is to do a single-precision store and then a
+ double precision load to get it back into a register. The
+ problem is, if the data is then written to memory a second
+ time, as in
+
+ STbe(...) = TruncF64asF32(...)
+
+      then will the second truncation further alter the value? The
+      answer is no: lfs (as generated here) followed by stfs
+      (generated for the STbe) is the identity function on 32-bit
+      floats, so we are safe.
+
+ Another upshot of this is that if iselStmt can see the
+ entirety of
+
+ STbe(...) = TruncF64asF32(arg)
+
+ then it can short circuit having to deal with TruncF64asF32
+ individually; instead just compute arg into a 64-bit FP
+ register and do 'fsts' (since that itself does the
+ truncation).
+
+ We generate pretty poor code here (should be ok both for
+ 32-bit and 64-bit mode); but it is expected that for the most
+ part the latter optimisation will apply and hence this code
+ will not often be used.
+ */
+ HReg fsrc = iselDblExpr(env, e->Iex.Unop.arg);
+ HReg fdst = newVRegF(env);
+ PPCAMode* zero_r1 = PPCAMode_IR( 0, StackFramePtr(env->mode64) );
+
+ sub_from_sp( env, 16 );
+ // store as F32, hence truncating
+ addInstr(env, PPCInstr_FpLdSt( False/*store*/, 4,
+ fsrc, zero_r1 ));
+ // and reload. Good huh?! (sigh)
+ addInstr(env, PPCInstr_FpLdSt( True/*load*/, 4,
+ fdst, zero_r1 ));
+ add_to_sp( env, 16 );
+ return fdst;
+ }
+
+ vex_printf("iselFltExpr(ppc): No such tag(%u)\n", e->tag);
+ ppIRExpr(e);
+ vpanic("iselFltExpr_wrk(ppc)");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Floating point expressions (64 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Compute a 64-bit floating point value into a register, the identity
+ of which is returned. As with iselIntExpr_R, the reg may be either
+ real or virtual; in any case it must not be changed by subsequent
+ code emitted by the caller. */
+
+/* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
+
+ Type S (1 bit) E (11 bits) F (52 bits)
+ ---- --------- ----------- -----------
+ signalling NaN u 2047 (max) .0uuuuu---u
+ (with at least
+ one 1 bit)
+ quiet NaN u 2047 (max) .1uuuuu---u
+
+ negative infinity 1 2047 (max) .000000---0
+
+ positive infinity 0 2047 (max) .000000---0
+
+ negative zero 1 0 .000000---0
+
+ positive zero 0 0 .000000---0
+*/
+
+static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselDblExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcFlt64);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ Bool mode64 = env->mode64;
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(e);
+ vassert(ty == Ity_F64);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ /* --------- LITERAL --------- */
+ if (e->tag == Iex_Const) {
+ union { UInt u32x2[2]; ULong u64; Double f64; } u;
+ vassert(sizeof(u) == 8);
+ vassert(sizeof(u.u64) == 8);
+ vassert(sizeof(u.f64) == 8);
+ vassert(sizeof(u.u32x2) == 8);
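+      /* Type-pun through the union so we get the raw 64-bit image of
+         the F64 constant, which is then loaded via the integer
+         side. */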
+
+ if (e->Iex.Const.con->tag == Ico_F64) {
+ u.f64 = e->Iex.Const.con->Ico.F64;
+ }
+ else if (e->Iex.Const.con->tag == Ico_F64i) {
+ u.u64 = e->Iex.Const.con->Ico.F64i;
+ }
+ else
+ vpanic("iselDblExpr(ppc): const");
+
+ if (!mode64) {
+ HReg r_srcHi = newVRegI(env);
+ HReg r_srcLo = newVRegI(env);
+ addInstr(env, PPCInstr_LI(r_srcHi, u.u32x2[0], mode64));
+ addInstr(env, PPCInstr_LI(r_srcLo, u.u32x2[1], mode64));
+ return mk_LoadRR32toFPR( env, r_srcHi, r_srcLo );
+ } else { // mode64
+ HReg r_src = newVRegI(env);
+ addInstr(env, PPCInstr_LI(r_src, u.u64, mode64));
+ return mk_LoadR64toFPR( env, r_src ); // 1*I64 -> F64
+ }
+ }
+
+ /* --------- LOAD --------- */
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_BE) {
+ HReg r_dst = newVRegF(env);
+ PPCAMode* am_addr;
+ vassert(e->Iex.Load.ty == Ity_F64);
+ am_addr = iselWordExpr_AMode(env, e->Iex.Load.addr, Ity_F64/*xfer*/);
+ addInstr(env, PPCInstr_FpLdSt(True/*load*/, 8, r_dst, am_addr));
+ return r_dst;
+ }
+
+ /* --------- GET --------- */
+ if (e->tag == Iex_Get) {
+ HReg r_dst = newVRegF(env);
+ PPCAMode* am_addr = PPCAMode_IR( e->Iex.Get.offset,
+ GuestStatePtr(mode64) );
+ addInstr(env, PPCInstr_FpLdSt( True/*load*/, 8, r_dst, am_addr ));
+ return r_dst;
+ }
+
+ /* --------- OPS --------- */
+ if (e->tag == Iex_Qop) {
+ PPCFpOp fpop = Pfp_INVALID;
+ switch (e->Iex.Qop.op) {
+ case Iop_MAddF64: fpop = Pfp_MADDD; break;
+ case Iop_MAddF64r32: fpop = Pfp_MADDS; break;
+ case Iop_MSubF64: fpop = Pfp_MSUBD; break;
+ case Iop_MSubF64r32: fpop = Pfp_MSUBS; break;
+ default: break;
+ }
+ if (fpop != Pfp_INVALID) {
+ HReg r_dst = newVRegF(env);
+ HReg r_srcML = iselDblExpr(env, e->Iex.Qop.arg2);
+ HReg r_srcMR = iselDblExpr(env, e->Iex.Qop.arg3);
+ HReg r_srcAcc = iselDblExpr(env, e->Iex.Qop.arg4);
+ set_FPU_rounding_mode( env, e->Iex.Qop.arg1 );
+ addInstr(env, PPCInstr_FpMulAcc(fpop, r_dst,
+ r_srcML, r_srcMR, r_srcAcc));
+ return r_dst;
+ }
+ }
+
+ if (e->tag == Iex_Triop) {
+ PPCFpOp fpop = Pfp_INVALID;
+ switch (e->Iex.Triop.op) {
+ case Iop_AddF64: fpop = Pfp_ADDD; break;
+ case Iop_SubF64: fpop = Pfp_SUBD; break;
+ case Iop_MulF64: fpop = Pfp_MULD; break;
+ case Iop_DivF64: fpop = Pfp_DIVD; break;
+ case Iop_AddF64r32: fpop = Pfp_ADDS; break;
+ case Iop_SubF64r32: fpop = Pfp_SUBS; break;
+ case Iop_MulF64r32: fpop = Pfp_MULS; break;
+ case Iop_DivF64r32: fpop = Pfp_DIVS; break;
+ default: break;
+ }
+ if (fpop != Pfp_INVALID) {
+ HReg r_dst = newVRegF(env);
+ HReg r_srcL = iselDblExpr(env, e->Iex.Triop.arg2);
+ HReg r_srcR = iselDblExpr(env, e->Iex.Triop.arg3);
+ set_FPU_rounding_mode( env, e->Iex.Triop.arg1 );
+ addInstr(env, PPCInstr_FpBinary(fpop, r_dst, r_srcL, r_srcR));
+ return r_dst;
+ }
+ }
+
+ if (e->tag == Iex_Binop) {
+ PPCFpOp fpop = Pfp_INVALID;
+ switch (e->Iex.Binop.op) {
+ case Iop_SqrtF64: fpop = Pfp_SQRT; break;
+ default: break;
+ }
+ if (fpop != Pfp_INVALID) {
+ HReg fr_dst = newVRegF(env);
+ HReg fr_src = iselDblExpr(env, e->Iex.Binop.arg2);
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+ addInstr(env, PPCInstr_FpUnary(fpop, fr_dst, fr_src));
+ return fr_dst;
+ }
+ }
+
+ if (e->tag == Iex_Binop) {
+
+ if (e->Iex.Binop.op == Iop_RoundF64toF32) {
+ HReg r_dst = newVRegF(env);
+ HReg r_src = iselDblExpr(env, e->Iex.Binop.arg2);
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+ addInstr(env, PPCInstr_FpRSP(r_dst, r_src));
+ //set_FPU_rounding_default( env );
+ return r_dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_I64StoF64) {
+ if (mode64) {
+ HReg fdst = newVRegF(env);
+ HReg isrc = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ HReg r1 = StackFramePtr(env->mode64);
+ PPCAMode* zero_r1 = PPCAMode_IR( 0, r1 );
+
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ sub_from_sp( env, 16 );
+
+ addInstr(env, PPCInstr_Store(8, zero_r1, isrc, True/*mode64*/));
+ addInstr(env, PPCInstr_FpLdSt(True/*load*/, 8, fdst, zero_r1));
+ addInstr(env, PPCInstr_FpCftI(True/*I->F*/, False/*int64*/,
+ fdst, fdst));
+
+ add_to_sp( env, 16 );
+
+ ///* Restore default FPU rounding. */
+ //set_FPU_rounding_default( env );
+ return fdst;
+ } else {
+ /* 32-bit mode */
+ HReg fdst = newVRegF(env);
+ HReg isrcHi, isrcLo;
+ HReg r1 = StackFramePtr(env->mode64);
+ PPCAMode* zero_r1 = PPCAMode_IR( 0, r1 );
+ PPCAMode* four_r1 = PPCAMode_IR( 4, r1 );
+
+ iselInt64Expr(&isrcHi, &isrcLo, env, e->Iex.Binop.arg2);
+
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ sub_from_sp( env, 16 );
+
+ addInstr(env, PPCInstr_Store(4, zero_r1, isrcHi, False/*mode32*/));
+ addInstr(env, PPCInstr_Store(4, four_r1, isrcLo, False/*mode32*/));
+ addInstr(env, PPCInstr_FpLdSt(True/*load*/, 8, fdst, zero_r1));
+ addInstr(env, PPCInstr_FpCftI(True/*I->F*/, False/*int64*/,
+ fdst, fdst));
+
+ add_to_sp( env, 16 );
+
+ ///* Restore default FPU rounding. */
+ //set_FPU_rounding_default( env );
+ return fdst;
+ }
+ }
+
+ }
+
+ if (e->tag == Iex_Unop) {
+ PPCFpOp fpop = Pfp_INVALID;
+ switch (e->Iex.Unop.op) {
+ case Iop_NegF64: fpop = Pfp_NEG; break;
+ case Iop_AbsF64: fpop = Pfp_ABS; break;
+ case Iop_Est5FRSqrt: fpop = Pfp_RSQRTE; break;
+ case Iop_RoundF64toF64_NegINF: fpop = Pfp_FRIM; break;
+ case Iop_RoundF64toF64_PosINF: fpop = Pfp_FRIP; break;
+ case Iop_RoundF64toF64_NEAREST: fpop = Pfp_FRIN; break;
+ case Iop_RoundF64toF64_ZERO: fpop = Pfp_FRIZ; break;
+ default: break;
+ }
+ if (fpop != Pfp_INVALID) {
+ HReg fr_dst = newVRegF(env);
+ HReg fr_src = iselDblExpr(env, e->Iex.Unop.arg);
+ addInstr(env, PPCInstr_FpUnary(fpop, fr_dst, fr_src));
+ return fr_dst;
+ }
+ }
+
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+ case Iop_ReinterpI64asF64: {
+ /* Given an I64, produce an IEEE754 double with the same
+ bit pattern. */
+ if (!mode64) {
+ HReg r_srcHi, r_srcLo;
+ iselInt64Expr( &r_srcHi, &r_srcLo, env, e->Iex.Unop.arg);
+ return mk_LoadRR32toFPR( env, r_srcHi, r_srcLo );
+ } else {
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ return mk_LoadR64toFPR( env, r_src );
+ }
+ }
+ case Iop_F32toF64: {
+ /* this is a no-op */
+ HReg res = iselFltExpr(env, e->Iex.Unop.arg);
+ return res;
+ }
+ default:
+ break;
+ }
+ }
+
+ /* --------- MULTIPLEX --------- */
+ if (e->tag == Iex_Mux0X) {
+ if (ty == Ity_F64
+ && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
+ PPCCondCode cc = mk_PPCCondCode( Pct_TRUE, Pcf_7EQ );
+ HReg r_cond = iselWordExpr_R(env, e->Iex.Mux0X.cond);
+ HReg frX = iselDblExpr(env, e->Iex.Mux0X.exprX);
+ HReg fr0 = iselDblExpr(env, e->Iex.Mux0X.expr0);
+ HReg fr_dst = newVRegF(env);
+ HReg r_tmp = newVRegI(env);
+ addInstr(env, PPCInstr_Alu(Palu_AND, r_tmp,
+ r_cond, PPCRH_Imm(False,0xFF)));
+ addInstr(env, PPCInstr_FpUnary( Pfp_MOV, fr_dst, frX ));
+ addInstr(env, PPCInstr_Cmp(False/*unsigned*/, True/*32bit cmp*/,
+ 7/*cr*/, r_tmp, PPCRH_Imm(False,0)));
+ addInstr(env, PPCInstr_FpCMov( cc, fr_dst, fr0 ));
+ return fr_dst;
+ }
+ }
+
+ vex_printf("iselDblExpr(ppc): No such tag(%u)\n", e->tag);
+ ppIRExpr(e);
+ vpanic("iselDblExpr_wrk(ppc)");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
+/*---------------------------------------------------------*/
+
+static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselVecExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcVec128);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ Bool mode64 = env->mode64;
+ PPCAvOp op = Pav_INVALID;
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(e);
+ vassert(ty == Ity_V128);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Get) {
+      /* Guest state vectors are 16-byte aligned,
+ so don't need to worry here */
+ HReg dst = newVRegV(env);
+ addInstr(env,
+ PPCInstr_AvLdSt( True/*load*/, 16, dst,
+ PPCAMode_IR( e->Iex.Get.offset,
+ GuestStatePtr(mode64) )));
+ return dst;
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_BE) {
+ PPCAMode* am_addr;
+ HReg v_dst = newVRegV(env);
+ vassert(e->Iex.Load.ty == Ity_V128);
+ am_addr = iselWordExpr_AMode(env, e->Iex.Load.addr, Ity_V128/*xfer*/);
+ addInstr(env, PPCInstr_AvLdSt( True/*load*/, 16, v_dst, am_addr));
+ return v_dst;
+ }
+
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+
+ case Iop_NotV128: {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ addInstr(env, PPCInstr_AvUnary(Pav_NOT, dst, arg));
+ return dst;
+ }
+
+ case Iop_CmpNEZ8x16: {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg zero = newVRegV(env);
+ HReg dst = newVRegV(env);
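+         /* CmpNEZ == NOT(CmpEQ against zero): build a zero vector by
+            XORing a register with itself, compare for equality, then
+            invert.  Same idea for the 16x8 and 32x4 variants below. */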
+ addInstr(env, PPCInstr_AvBinary(Pav_XOR, zero, zero, zero));
+ addInstr(env, PPCInstr_AvBin8x16(Pav_CMPEQU, dst, arg, zero));
+ addInstr(env, PPCInstr_AvUnary(Pav_NOT, dst, dst));
+ return dst;
+ }
+
+ case Iop_CmpNEZ16x8: {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg zero = newVRegV(env);
+ HReg dst = newVRegV(env);
+ addInstr(env, PPCInstr_AvBinary(Pav_XOR, zero, zero, zero));
+ addInstr(env, PPCInstr_AvBin16x8(Pav_CMPEQU, dst, arg, zero));
+ addInstr(env, PPCInstr_AvUnary(Pav_NOT, dst, dst));
+ return dst;
+ }
+
+ case Iop_CmpNEZ32x4: {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg zero = newVRegV(env);
+ HReg dst = newVRegV(env);
+ addInstr(env, PPCInstr_AvBinary(Pav_XOR, zero, zero, zero));
+ addInstr(env, PPCInstr_AvBin32x4(Pav_CMPEQU, dst, arg, zero));
+ addInstr(env, PPCInstr_AvUnary(Pav_NOT, dst, dst));
+ return dst;
+ }
+
+ case Iop_Recip32Fx4: op = Pavfp_RCPF; goto do_32Fx4_unary;
+ case Iop_RSqrt32Fx4: op = Pavfp_RSQRTF; goto do_32Fx4_unary;
+ case Iop_I32UtoFx4: op = Pavfp_CVTU2F; goto do_32Fx4_unary;
+ case Iop_I32StoFx4: op = Pavfp_CVTS2F; goto do_32Fx4_unary;
+ case Iop_QFtoI32Ux4_RZ: op = Pavfp_QCVTF2U; goto do_32Fx4_unary;
+ case Iop_QFtoI32Sx4_RZ: op = Pavfp_QCVTF2S; goto do_32Fx4_unary;
+ case Iop_RoundF32x4_RM: op = Pavfp_ROUNDM; goto do_32Fx4_unary;
+ case Iop_RoundF32x4_RP: op = Pavfp_ROUNDP; goto do_32Fx4_unary;
+ case Iop_RoundF32x4_RN: op = Pavfp_ROUNDN; goto do_32Fx4_unary;
+ case Iop_RoundF32x4_RZ: op = Pavfp_ROUNDZ; goto do_32Fx4_unary;
+ do_32Fx4_unary:
+ {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ addInstr(env, PPCInstr_AvUn32Fx4(op, dst, arg));
+ return dst;
+ }
+
+ case Iop_32UtoV128: {
+ HReg r_aligned16, r_zeros;
+ HReg r_src = iselWordExpr_R(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ PPCAMode *am_off0, *am_off4, *am_off8, *am_off12;
+ sub_from_sp( env, 32 ); // Move SP down
+
+ /* Get a quadword aligned address within our stack space */
+ r_aligned16 = get_sp_aligned16( env );
+ am_off0 = PPCAMode_IR( 0, r_aligned16 );
+ am_off4 = PPCAMode_IR( 4, r_aligned16 );
+ am_off8 = PPCAMode_IR( 8, r_aligned16 );
+ am_off12 = PPCAMode_IR( 12, r_aligned16 );
+
+ /* Store zeros */
+ r_zeros = newVRegI(env);
+ addInstr(env, PPCInstr_LI(r_zeros, 0x0, mode64));
+ addInstr(env, PPCInstr_Store( 4, am_off0, r_zeros, mode64 ));
+ addInstr(env, PPCInstr_Store( 4, am_off4, r_zeros, mode64 ));
+ addInstr(env, PPCInstr_Store( 4, am_off8, r_zeros, mode64 ));
+
+ /* Store r_src in low word of quadword-aligned mem */
+ addInstr(env, PPCInstr_Store( 4, am_off12, r_src, mode64 ));
+
+ /* Load word into low word of quadword vector reg */
+ addInstr(env, PPCInstr_AvLdSt( True/*ld*/, 4, dst, am_off12 ));
+
+ add_to_sp( env, 32 ); // Reset SP
+ return dst;
+ }
+
+ case Iop_Dup8x16:
+ case Iop_Dup16x8:
+ case Iop_Dup32x4:
+         return mk_AvDuplicateRI(env, e->Iex.Unop.arg);
+
+ default:
+ break;
+ } /* switch (e->Iex.Unop.op) */
+ } /* if (e->tag == Iex_Unop) */
+
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+
+ case Iop_64HLtoV128: {
+ if (!mode64) {
+ HReg r3, r2, r1, r0, r_aligned16;
+ PPCAMode *am_off0, *am_off4, *am_off8, *am_off12;
+ HReg dst = newVRegV(env);
+ /* do this via the stack (easy, convenient, etc) */
+ sub_from_sp( env, 32 ); // Move SP down
+
+ // get a quadword aligned address within our stack space
+ r_aligned16 = get_sp_aligned16( env );
+ am_off0 = PPCAMode_IR( 0, r_aligned16 );
+ am_off4 = PPCAMode_IR( 4, r_aligned16 );
+ am_off8 = PPCAMode_IR( 8, r_aligned16 );
+ am_off12 = PPCAMode_IR( 12, r_aligned16 );
+
+ /* Do the less significant 64 bits */
+ iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_Store( 4, am_off12, r0, mode64 ));
+ addInstr(env, PPCInstr_Store( 4, am_off8, r1, mode64 ));
+ /* Do the more significant 64 bits */
+ iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
+ addInstr(env, PPCInstr_Store( 4, am_off4, r2, mode64 ));
+ addInstr(env, PPCInstr_Store( 4, am_off0, r3, mode64 ));
+
+ /* Fetch result back from stack. */
+ addInstr(env, PPCInstr_AvLdSt(True/*ld*/, 16, dst, am_off0));
+
+ add_to_sp( env, 32 ); // Reset SP
+ return dst;
+ } else {
+ HReg rHi = iselWordExpr_R(env, e->Iex.Binop.arg1);
+ HReg rLo = iselWordExpr_R(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ HReg r_aligned16;
+ PPCAMode *am_off0, *am_off8;
+ /* do this via the stack (easy, convenient, etc) */
+ sub_from_sp( env, 32 ); // Move SP down
+
+ // get a quadword aligned address within our stack space
+ r_aligned16 = get_sp_aligned16( env );
+ am_off0 = PPCAMode_IR( 0, r_aligned16 );
+ am_off8 = PPCAMode_IR( 8, r_aligned16 );
+
+ /* Store 2*I64 to stack */
+ addInstr(env, PPCInstr_Store( 8, am_off0, rHi, mode64 ));
+ addInstr(env, PPCInstr_Store( 8, am_off8, rLo, mode64 ));
+
+ /* Fetch result back from stack. */
+ addInstr(env, PPCInstr_AvLdSt(True/*ld*/, 16, dst, am_off0));
+
+ add_to_sp( env, 32 ); // Reset SP
+ return dst;
+ }
+ }
+
+ case Iop_Add32Fx4: op = Pavfp_ADDF; goto do_32Fx4;
+ case Iop_Sub32Fx4: op = Pavfp_SUBF; goto do_32Fx4;
+ case Iop_Max32Fx4: op = Pavfp_MAXF; goto do_32Fx4;
+ case Iop_Min32Fx4: op = Pavfp_MINF; goto do_32Fx4;
+ case Iop_Mul32Fx4: op = Pavfp_MULF; goto do_32Fx4;
+ case Iop_CmpEQ32Fx4: op = Pavfp_CMPEQF; goto do_32Fx4;
+ case Iop_CmpGT32Fx4: op = Pavfp_CMPGTF; goto do_32Fx4;
+ case Iop_CmpGE32Fx4: op = Pavfp_CMPGEF; goto do_32Fx4;
+ do_32Fx4:
+ {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, PPCInstr_AvBin32Fx4(op, dst, argL, argR));
+ return dst;
+ }
+
+ case Iop_CmpLE32Fx4: {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+
+ /* stay consistent with native ppc compares:
+ if a left/right lane holds a nan, return zeros for that lane
+ so: le == NOT(gt OR isNan)
+ */
+ HReg isNanLR = newVRegV(env);
+ HReg isNanL = isNan(env, argL);
+ HReg isNanR = isNan(env, argR);
+ addInstr(env, PPCInstr_AvBinary(Pav_OR, isNanLR,
+ isNanL, isNanR));
+
+ addInstr(env, PPCInstr_AvBin32Fx4(Pavfp_CMPGTF, dst,
+ argL, argR));
+ addInstr(env, PPCInstr_AvBinary(Pav_OR, dst, dst, isNanLR));
+ addInstr(env, PPCInstr_AvUnary(Pav_NOT, dst, dst));
+ return dst;
+ }
+
+ case Iop_AndV128: op = Pav_AND; goto do_AvBin;
+ case Iop_OrV128: op = Pav_OR; goto do_AvBin;
+ case Iop_XorV128: op = Pav_XOR; goto do_AvBin;
+ do_AvBin: {
+ HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, PPCInstr_AvBinary(op, dst, arg1, arg2));
+ return dst;
+ }
+
+ case Iop_Shl8x16: op = Pav_SHL; goto do_AvBin8x16;
+ case Iop_Shr8x16: op = Pav_SHR; goto do_AvBin8x16;
+ case Iop_Sar8x16: op = Pav_SAR; goto do_AvBin8x16;
+ case Iop_Rol8x16: op = Pav_ROTL; goto do_AvBin8x16;
+ case Iop_InterleaveHI8x16: op = Pav_MRGHI; goto do_AvBin8x16;
+ case Iop_InterleaveLO8x16: op = Pav_MRGLO; goto do_AvBin8x16;
+ case Iop_Add8x16: op = Pav_ADDU; goto do_AvBin8x16;
+ case Iop_QAdd8Ux16: op = Pav_QADDU; goto do_AvBin8x16;
+ case Iop_QAdd8Sx16: op = Pav_QADDS; goto do_AvBin8x16;
+ case Iop_Sub8x16: op = Pav_SUBU; goto do_AvBin8x16;
+ case Iop_QSub8Ux16: op = Pav_QSUBU; goto do_AvBin8x16;
+ case Iop_QSub8Sx16: op = Pav_QSUBS; goto do_AvBin8x16;
+ case Iop_Avg8Ux16: op = Pav_AVGU; goto do_AvBin8x16;
+ case Iop_Avg8Sx16: op = Pav_AVGS; goto do_AvBin8x16;
+ case Iop_Max8Ux16: op = Pav_MAXU; goto do_AvBin8x16;
+ case Iop_Max8Sx16: op = Pav_MAXS; goto do_AvBin8x16;
+ case Iop_Min8Ux16: op = Pav_MINU; goto do_AvBin8x16;
+ case Iop_Min8Sx16: op = Pav_MINS; goto do_AvBin8x16;
+ case Iop_MullEven8Ux16: op = Pav_OMULU; goto do_AvBin8x16;
+ case Iop_MullEven8Sx16: op = Pav_OMULS; goto do_AvBin8x16;
+ case Iop_CmpEQ8x16: op = Pav_CMPEQU; goto do_AvBin8x16;
+ case Iop_CmpGT8Ux16: op = Pav_CMPGTU; goto do_AvBin8x16;
+ case Iop_CmpGT8Sx16: op = Pav_CMPGTS; goto do_AvBin8x16;
+ do_AvBin8x16: {
+ HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, PPCInstr_AvBin8x16(op, dst, arg1, arg2));
+ return dst;
+ }
+
+ case Iop_Shl16x8: op = Pav_SHL; goto do_AvBin16x8;
+ case Iop_Shr16x8: op = Pav_SHR; goto do_AvBin16x8;
+ case Iop_Sar16x8: op = Pav_SAR; goto do_AvBin16x8;
+ case Iop_Rol16x8: op = Pav_ROTL; goto do_AvBin16x8;
+ case Iop_Narrow16x8: op = Pav_PACKUU; goto do_AvBin16x8;
+ case Iop_QNarrow16Ux8: op = Pav_QPACKUU; goto do_AvBin16x8;
+ case Iop_QNarrow16Sx8: op = Pav_QPACKSS; goto do_AvBin16x8;
+ case Iop_InterleaveHI16x8: op = Pav_MRGHI; goto do_AvBin16x8;
+ case Iop_InterleaveLO16x8: op = Pav_MRGLO; goto do_AvBin16x8;
+ case Iop_Add16x8: op = Pav_ADDU; goto do_AvBin16x8;
+ case Iop_QAdd16Ux8: op = Pav_QADDU; goto do_AvBin16x8;
+ case Iop_QAdd16Sx8: op = Pav_QADDS; goto do_AvBin16x8;
+ case Iop_Sub16x8: op = Pav_SUBU; goto do_AvBin16x8;
+ case Iop_QSub16Ux8: op = Pav_QSUBU; goto do_AvBin16x8;
+ case Iop_QSub16Sx8: op = Pav_QSUBS; goto do_AvBin16x8;
+ case Iop_Avg16Ux8: op = Pav_AVGU; goto do_AvBin16x8;
+ case Iop_Avg16Sx8: op = Pav_AVGS; goto do_AvBin16x8;
+ case Iop_Max16Ux8: op = Pav_MAXU; goto do_AvBin16x8;
+ case Iop_Max16Sx8: op = Pav_MAXS; goto do_AvBin16x8;
+ case Iop_Min16Ux8: op = Pav_MINU; goto do_AvBin16x8;
+ case Iop_Min16Sx8: op = Pav_MINS; goto do_AvBin16x8;
+ case Iop_MullEven16Ux8: op = Pav_OMULU; goto do_AvBin16x8;
+ case Iop_MullEven16Sx8: op = Pav_OMULS; goto do_AvBin16x8;
+ case Iop_CmpEQ16x8: op = Pav_CMPEQU; goto do_AvBin16x8;
+ case Iop_CmpGT16Ux8: op = Pav_CMPGTU; goto do_AvBin16x8;
+ case Iop_CmpGT16Sx8: op = Pav_CMPGTS; goto do_AvBin16x8;
+ do_AvBin16x8: {
+ HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, PPCInstr_AvBin16x8(op, dst, arg1, arg2));
+ return dst;
+ }
+
+ case Iop_Shl32x4: op = Pav_SHL; goto do_AvBin32x4;
+ case Iop_Shr32x4: op = Pav_SHR; goto do_AvBin32x4;
+ case Iop_Sar32x4: op = Pav_SAR; goto do_AvBin32x4;
+ case Iop_Rol32x4: op = Pav_ROTL; goto do_AvBin32x4;
+ case Iop_Narrow32x4: op = Pav_PACKUU; goto do_AvBin32x4;
+ case Iop_QNarrow32Ux4: op = Pav_QPACKUU; goto do_AvBin32x4;
+ case Iop_QNarrow32Sx4: op = Pav_QPACKSS; goto do_AvBin32x4;
+ case Iop_InterleaveHI32x4: op = Pav_MRGHI; goto do_AvBin32x4;
+ case Iop_InterleaveLO32x4: op = Pav_MRGLO; goto do_AvBin32x4;
+ case Iop_Add32x4: op = Pav_ADDU; goto do_AvBin32x4;
+ case Iop_QAdd32Ux4: op = Pav_QADDU; goto do_AvBin32x4;
+ case Iop_QAdd32Sx4: op = Pav_QADDS; goto do_AvBin32x4;
+ case Iop_Sub32x4: op = Pav_SUBU; goto do_AvBin32x4;
+ case Iop_QSub32Ux4: op = Pav_QSUBU; goto do_AvBin32x4;
+ case Iop_QSub32Sx4: op = Pav_QSUBS; goto do_AvBin32x4;
+ case Iop_Avg32Ux4: op = Pav_AVGU; goto do_AvBin32x4;
+ case Iop_Avg32Sx4: op = Pav_AVGS; goto do_AvBin32x4;
+ case Iop_Max32Ux4: op = Pav_MAXU; goto do_AvBin32x4;
+ case Iop_Max32Sx4: op = Pav_MAXS; goto do_AvBin32x4;
+ case Iop_Min32Ux4: op = Pav_MINU; goto do_AvBin32x4;
+ case Iop_Min32Sx4: op = Pav_MINS; goto do_AvBin32x4;
+ case Iop_CmpEQ32x4: op = Pav_CMPEQU; goto do_AvBin32x4;
+ case Iop_CmpGT32Ux4: op = Pav_CMPGTU; goto do_AvBin32x4;
+ case Iop_CmpGT32Sx4: op = Pav_CMPGTS; goto do_AvBin32x4;
+ do_AvBin32x4: {
+ HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, PPCInstr_AvBin32x4(op, dst, arg1, arg2));
+ return dst;
+ }
+
+ case Iop_ShlN8x16: op = Pav_SHL; goto do_AvShift8x16;
+ case Iop_SarN8x16: op = Pav_SAR; goto do_AvShift8x16;
+ do_AvShift8x16: {
+ HReg r_src = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg dst = newVRegV(env);
+ HReg v_shft = mk_AvDuplicateRI(env, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_AvBin8x16(op, dst, r_src, v_shft));
+ return dst;
+ }
+
+ case Iop_ShlN16x8: op = Pav_SHL; goto do_AvShift16x8;
+ case Iop_ShrN16x8: op = Pav_SHR; goto do_AvShift16x8;
+ case Iop_SarN16x8: op = Pav_SAR; goto do_AvShift16x8;
+ do_AvShift16x8: {
+ HReg r_src = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg dst = newVRegV(env);
+ HReg v_shft = mk_AvDuplicateRI(env, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_AvBin16x8(op, dst, r_src, v_shft));
+ return dst;
+ }
+
+ case Iop_ShlN32x4: op = Pav_SHL; goto do_AvShift32x4;
+ case Iop_ShrN32x4: op = Pav_SHR; goto do_AvShift32x4;
+ case Iop_SarN32x4: op = Pav_SAR; goto do_AvShift32x4;
+ do_AvShift32x4: {
+ HReg r_src = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg dst = newVRegV(env);
+ HReg v_shft = mk_AvDuplicateRI(env, e->Iex.Binop.arg2);
+ addInstr(env, PPCInstr_AvBin32x4(op, dst, r_src, v_shft));
+ return dst;
+ }
+
+ case Iop_ShrV128: op = Pav_SHR; goto do_AvShiftV128;
+ case Iop_ShlV128: op = Pav_SHL; goto do_AvShiftV128;
+ do_AvShiftV128: {
+ HReg dst = newVRegV(env);
+ HReg r_src = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg v_shft = mk_AvDuplicateRI(env, e->Iex.Binop.arg2);
+ /* Note: shift value gets masked by 127 */
+ addInstr(env, PPCInstr_AvBinary(op, dst, r_src, v_shft));
+ return dst;
+ }
+
+ case Iop_Perm8x16: {
+ HReg dst = newVRegV(env);
+ HReg v_src = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg v_ctl = iselVecExpr(env, e->Iex.Binop.arg2);
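+         /* AvPerm (vperm) selects bytes from the concatenation of its
+            two source vectors under control of v_ctl; passing v_src
+            twice permutes within a single vector. */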
+ addInstr(env, PPCInstr_AvPerm(dst, v_src, v_src, v_ctl));
+ return dst;
+ }
+
+ default:
+ break;
+ } /* switch (e->Iex.Binop.op) */
+ } /* if (e->tag == Iex_Binop) */
+
+ if (e->tag == Iex_Const ) {
+ vassert(e->Iex.Const.con->tag == Ico_V128);
+ if (e->Iex.Const.con->Ico.V128 == 0x0000) {
+ return generate_zeroes_V128(env);
+ }
+ }
+
+ vex_printf("iselVecExpr(ppc) (subarch = %s): can't reduce\n",
+ LibVEX_ppVexHwCaps(mode64 ? VexArchPPC64 : VexArchPPC32,
+ env->hwcaps));
+ ppIRExpr(e);
+ vpanic("iselVecExpr_wrk(ppc)");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Statements ---*/
+/*---------------------------------------------------------*/
+
+static void iselStmt ( ISelEnv* env, IRStmt* stmt )
+{
+ Bool mode64 = env->mode64;
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ vex_printf("\n -- ");
+ ppIRStmt(stmt);
+ vex_printf("\n");
+ }
+
+ switch (stmt->tag) {
+
+ /* --------- STORE --------- */
+ case Ist_Store: {
+ IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
+ IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
+ IREndness end = stmt->Ist.Store.end;
+
+ if (end != Iend_BE)
+ goto stmt_fail;
+ if (!mode64 && (tya != Ity_I32))
+ goto stmt_fail;
+ if (mode64 && (tya != Ity_I64))
+ goto stmt_fail;
+
+ if (tyd == Ity_I8 || tyd == Ity_I16 || tyd == Ity_I32 ||
+ (mode64 && (tyd == Ity_I64))) {
+ PPCAMode* am_addr
+ = iselWordExpr_AMode(env, stmt->Ist.Store.addr, tyd/*of xfer*/);
+ HReg r_src = iselWordExpr_R(env, stmt->Ist.Store.data);
+ addInstr(env, PPCInstr_Store( toUChar(sizeofIRType(tyd)),
+ am_addr, r_src, mode64 ));
+ return;
+ }
+ if (tyd == Ity_F64) {
+ PPCAMode* am_addr
+ = iselWordExpr_AMode(env, stmt->Ist.Store.addr, tyd/*of xfer*/);
+ HReg fr_src = iselDblExpr(env, stmt->Ist.Store.data);
+ addInstr(env,
+ PPCInstr_FpLdSt(False/*store*/, 8, fr_src, am_addr));
+ return;
+ }
+ if (tyd == Ity_F32) {
+ PPCAMode* am_addr
+ = iselWordExpr_AMode(env, stmt->Ist.Store.addr, tyd/*of xfer*/);
+ HReg fr_src = iselFltExpr(env, stmt->Ist.Store.data);
+ addInstr(env,
+ PPCInstr_FpLdSt(False/*store*/, 4, fr_src, am_addr));
+ return;
+ }
+ if (tyd == Ity_V128) {
+ PPCAMode* am_addr
+ = iselWordExpr_AMode(env, stmt->Ist.Store.addr, tyd/*of xfer*/);
+ HReg v_src = iselVecExpr(env, stmt->Ist.Store.data);
+ addInstr(env,
+ PPCInstr_AvLdSt(False/*store*/, 16, v_src, am_addr));
+ return;
+ }
+ if (tyd == Ity_I64 && !mode64) {
+ /* Just calculate the address in the register. Life is too
+ short to arse around trying and possibly failing to adjust
+ the offset in a 'reg+offset' style amode. */
+ HReg rHi32, rLo32;
+ HReg r_addr = iselWordExpr_R(env, stmt->Ist.Store.addr);
+ iselInt64Expr( &rHi32, &rLo32, env, stmt->Ist.Store.data );
+ addInstr(env, PPCInstr_Store( 4/*byte-store*/,
+ PPCAMode_IR( 0, r_addr ),
+ rHi32,
+ False/*32-bit insn please*/) );
+ addInstr(env, PPCInstr_Store( 4/*byte-store*/,
+ PPCAMode_IR( 4, r_addr ),
+ rLo32,
+ False/*32-bit insn please*/) );
+ return;
+ }
+ break;
+ }
+
+ /* --------- PUT --------- */
+ case Ist_Put: {
+ IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
+ if (ty == Ity_I8 || ty == Ity_I16 ||
+ ty == Ity_I32 || ((ty == Ity_I64) && mode64)) {
+ HReg r_src = iselWordExpr_R(env, stmt->Ist.Put.data);
+ PPCAMode* am_addr = PPCAMode_IR( stmt->Ist.Put.offset,
+ GuestStatePtr(mode64) );
+ addInstr(env, PPCInstr_Store( toUChar(sizeofIRType(ty)),
+ am_addr, r_src, mode64 ));
+ return;
+ }
+ if (!mode64 && ty == Ity_I64) {
+ HReg rHi, rLo;
+ PPCAMode* am_addr = PPCAMode_IR( stmt->Ist.Put.offset,
+ GuestStatePtr(mode64) );
+ PPCAMode* am_addr4 = advance4(env, am_addr);
+ iselInt64Expr(&rHi,&rLo, env, stmt->Ist.Put.data);
+ addInstr(env, PPCInstr_Store( 4, am_addr, rHi, mode64 ));
+ addInstr(env, PPCInstr_Store( 4, am_addr4, rLo, mode64 ));
+ return;
+ }
+ if (ty == Ity_V128) {
+         /* Guest state vectors are 16-byte aligned,
+ so don't need to worry here */
+ HReg v_src = iselVecExpr(env, stmt->Ist.Put.data);
+ PPCAMode* am_addr = PPCAMode_IR( stmt->Ist.Put.offset,
+ GuestStatePtr(mode64) );
+ addInstr(env,
+ PPCInstr_AvLdSt(False/*store*/, 16, v_src, am_addr));
+ return;
+ }
+ if (ty == Ity_F64) {
+ HReg fr_src = iselDblExpr(env, stmt->Ist.Put.data);
+ PPCAMode* am_addr = PPCAMode_IR( stmt->Ist.Put.offset,
+ GuestStatePtr(mode64) );
+ addInstr(env, PPCInstr_FpLdSt( False/*store*/, 8,
+ fr_src, am_addr ));
+ return;
+ }
+ break;
+ }
+
+ /* --------- Indexed PUT --------- */
+ case Ist_PutI: {
+ PPCAMode* dst_am
+ = genGuestArrayOffset(
+ env, stmt->Ist.PutI.descr,
+ stmt->Ist.PutI.ix, stmt->Ist.PutI.bias );
+ IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.PutI.data);
+ if (mode64 && ty == Ity_I64) {
+ HReg r_src = iselWordExpr_R(env, stmt->Ist.PutI.data);
+ addInstr(env, PPCInstr_Store( toUChar(8),
+ dst_am, r_src, mode64 ));
+ return;
+ }
+ if ((!mode64) && ty == Ity_I32) {
+ HReg r_src = iselWordExpr_R(env, stmt->Ist.PutI.data);
+ addInstr(env, PPCInstr_Store( toUChar(4),
+ dst_am, r_src, mode64 ));
+ return;
+ }
+ break;
+ }
+
+ /* --------- TMP --------- */
+ case Ist_WrTmp: {
+ IRTemp tmp = stmt->Ist.WrTmp.tmp;
+ IRType ty = typeOfIRTemp(env->type_env, tmp);
+ if (ty == Ity_I8 || ty == Ity_I16 ||
+ ty == Ity_I32 || ((ty == Ity_I64) && mode64)) {
+ HReg r_dst = lookupIRTemp(env, tmp);
+ HReg r_src = iselWordExpr_R(env, stmt->Ist.WrTmp.data);
+ addInstr(env, mk_iMOVds_RR( r_dst, r_src ));
+ return;
+ }
+ if (!mode64 && ty == Ity_I64) {
+ HReg r_srcHi, r_srcLo, r_dstHi, r_dstLo;
+ iselInt64Expr(&r_srcHi,&r_srcLo, env, stmt->Ist.WrTmp.data);
+ lookupIRTempPair( &r_dstHi, &r_dstLo, env, tmp);
+ addInstr(env, mk_iMOVds_RR(r_dstHi, r_srcHi) );
+ addInstr(env, mk_iMOVds_RR(r_dstLo, r_srcLo) );
+ return;
+ }
+ if (mode64 && ty == Ity_I128) {
+ HReg r_srcHi, r_srcLo, r_dstHi, r_dstLo;
+ iselInt128Expr(&r_srcHi,&r_srcLo, env, stmt->Ist.WrTmp.data);
+ lookupIRTempPair( &r_dstHi, &r_dstLo, env, tmp);
+ addInstr(env, mk_iMOVds_RR(r_dstHi, r_srcHi) );
+ addInstr(env, mk_iMOVds_RR(r_dstLo, r_srcLo) );
+ return;
+ }
+ if (ty == Ity_I1) {
+ PPCCondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
+ HReg r_dst = lookupIRTemp(env, tmp);
+ addInstr(env, PPCInstr_Set(cond, r_dst));
+ return;
+ }
+ if (ty == Ity_F64) {
+ HReg fr_dst = lookupIRTemp(env, tmp);
+ HReg fr_src = iselDblExpr(env, stmt->Ist.WrTmp.data);
+ addInstr(env, PPCInstr_FpUnary(Pfp_MOV, fr_dst, fr_src));
+ return;
+ }
+ if (ty == Ity_F32) {
+ HReg fr_dst = lookupIRTemp(env, tmp);
+ HReg fr_src = iselFltExpr(env, stmt->Ist.WrTmp.data);
+ addInstr(env, PPCInstr_FpUnary(Pfp_MOV, fr_dst, fr_src));
+ return;
+ }
+ if (ty == Ity_V128) {
+ HReg v_dst = lookupIRTemp(env, tmp);
+ HReg v_src = iselVecExpr(env, stmt->Ist.WrTmp.data);
+ addInstr(env, PPCInstr_AvUnary(Pav_MOV, v_dst, v_src));
+ return;
+ }
+ break;
+ }
+
+ /* --------- Load Linked or Store Conditional --------- */
+ case Ist_LLSC: {
+ IRTemp res = stmt->Ist.LLSC.result;
+ IRType tyRes = typeOfIRTemp(env->type_env, res);
+ IRType tyAddr = typeOfIRExpr(env->type_env, stmt->Ist.LLSC.addr);
+
+ if (stmt->Ist.LLSC.end != Iend_BE)
+ goto stmt_fail;
+ if (!mode64 && (tyAddr != Ity_I32))
+ goto stmt_fail;
+ if (mode64 && (tyAddr != Ity_I64))
+ goto stmt_fail;
+
+ if (stmt->Ist.LLSC.storedata == NULL) {
+ /* LL */
+ HReg r_addr = iselWordExpr_R( env, stmt->Ist.LLSC.addr );
+ HReg r_dst = lookupIRTemp(env, res);
+ if (tyRes == Ity_I32) {
+ addInstr(env, PPCInstr_LoadL( 4, r_dst, r_addr, mode64 ));
+ return;
+ }
+ if (tyRes == Ity_I64 && mode64) {
+ addInstr(env, PPCInstr_LoadL( 8, r_dst, r_addr, mode64 ));
+ return;
+ }
+ /* fallthru */;
+ } else {
+ /* SC */
+ HReg r_res = lookupIRTemp(env, res); /* :: Ity_I1 */
+ HReg r_a = iselWordExpr_R(env, stmt->Ist.LLSC.addr);
+ HReg r_src = iselWordExpr_R(env, stmt->Ist.LLSC.storedata);
+ HReg r_tmp = newVRegI(env);
+ IRType tyData = typeOfIRExpr(env->type_env,
+ stmt->Ist.LLSC.storedata);
+ vassert(tyRes == Ity_I1);
+ if (tyData == Ity_I32 || (tyData == Ity_I64 && mode64)) {
+ addInstr(env, PPCInstr_StoreC( tyData==Ity_I32 ? 4 : 8,
+ r_a, r_src, mode64 ));
+ addInstr(env, PPCInstr_MfCR( r_tmp ));
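+            /* stwcx./stdcx. set CR0.EQ on success; after mfcr that
+               flag sits at bit position 29 (counting from lsb 0) of
+               the CR image, so shift it down to bit 0. */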
+ addInstr(env, PPCInstr_Shft(
+ Pshft_SHR,
+ env->mode64 ? False : True
+ /*F:64-bit, T:32-bit shift*/,
+ r_tmp, r_tmp,
+ PPCRH_Imm(False/*unsigned*/, 29)));
+ /* Probably unnecessary, since the IR dest type is Ity_I1,
+ and so we are entitled to leave whatever junk we like
+ drifting round in the upper 31 or 63 bits of r_res.
+ However, for the sake of conservativeness .. */
+ addInstr(env, PPCInstr_Alu(
+ Palu_AND,
+ r_res, r_tmp,
+                             PPCRH_Imm(False/*unsigned*/, 1)));
+ return;
+ }
+ /* fallthru */
+ }
+ goto stmt_fail;
+ /*NOTREACHED*/
+ }
+
+ /* --------- Call to DIRTY helper --------- */
+ case Ist_Dirty: {
+ IRType retty;
+ IRDirty* d = stmt->Ist.Dirty.details;
+ Bool passBBP = False;
+
+ if (d->nFxState == 0)
+ vassert(!d->needsBBP);
+ passBBP = toBool(d->nFxState > 0 && d->needsBBP);
+
+ /* Marshal args, do the call, clear stack. */
+ doHelperCall( env, passBBP, d->guard, d->cee, d->args );
+
+ /* Now figure out what to do with the returned value, if any. */
+ if (d->tmp == IRTemp_INVALID)
+ /* No return value. Nothing to do. */
+ return;
+
+ retty = typeOfIRTemp(env->type_env, d->tmp);
+ if (!mode64 && retty == Ity_I64) {
+ HReg r_dstHi, r_dstLo;
+ /* The returned value is in %r3:%r4. Park it in the
+ register-pair associated with tmp. */
+ lookupIRTempPair( &r_dstHi, &r_dstLo, env, d->tmp);
+ addInstr(env, mk_iMOVds_RR(r_dstHi, hregPPC_GPR3(mode64)));
+ addInstr(env, mk_iMOVds_RR(r_dstLo, hregPPC_GPR4(mode64)));
+ return;
+ }
+ if (retty == Ity_I8 || retty == Ity_I16 ||
+ retty == Ity_I32 || ((retty == Ity_I64) && mode64)) {
+ /* The returned value is in %r3. Park it in the register
+ associated with tmp. */
+ HReg r_dst = lookupIRTemp(env, d->tmp);
+ addInstr(env, mk_iMOVds_RR(r_dst, hregPPC_GPR3(mode64)));
+ return;
+ }
+ break;
+ }
+
+ /* --------- MEM FENCE --------- */
+ case Ist_MBE:
+ switch (stmt->Ist.MBE.event) {
+ case Imbe_Fence:
+ addInstr(env, PPCInstr_MFence());
+ return;
+ default:
+ break;
+ }
+ break;
+
+ /* --------- INSTR MARK --------- */
+ /* Doesn't generate any executable code ... */
+ case Ist_IMark:
+ return;
+
+ /* --------- ABI HINT --------- */
+ /* These have no meaning (denotation in the IR) and so we ignore
+ them ... if any actually made it this far. */
+ case Ist_AbiHint:
+ return;
+
+ /* --------- NO-OP --------- */
+ /* Fairly self-explanatory, wouldn't you say? */
+ case Ist_NoOp:
+ return;
+
+ /* --------- EXIT --------- */
+ case Ist_Exit: {
+ PPCRI* ri_dst;
+ PPCCondCode cc;
+ IRConstTag tag = stmt->Ist.Exit.dst->tag;
+ if (!mode64 && (tag != Ico_U32))
+ vpanic("iselStmt(ppc): Ist_Exit: dst is not a 32-bit value");
+ if (mode64 && (tag != Ico_U64))
+ vpanic("iselStmt(ppc64): Ist_Exit: dst is not a 64-bit value");
+ ri_dst = iselWordExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
+ cc = iselCondCode(env,stmt->Ist.Exit.guard);
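+      /* Write back the link register saved at block entry, since
+         helper calls made within the block may have trashed it. */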
+ addInstr(env, PPCInstr_RdWrLR(True, env->savedLR));
+ addInstr(env, PPCInstr_Goto(stmt->Ist.Exit.jk, cc, ri_dst));
+ return;
+ }
+
+ default: break;
+ }
+ stmt_fail:
+ ppIRStmt(stmt);
+ vpanic("iselStmt(ppc)");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Basic block terminators (Nexts) ---*/
+/*---------------------------------------------------------*/
+
+static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
+{
+ PPCCondCode cond;
+ PPCRI* ri;
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ vex_printf("\n-- goto {");
+ ppIRJumpKind(jk);
+ vex_printf("} ");
+ ppIRExpr(next);
+ vex_printf("\n");
+ }
+ cond = mk_PPCCondCode( Pct_ALWAYS, Pcf_7EQ );
+ ri = iselWordExpr_RI(env, next);
+ addInstr(env, PPCInstr_RdWrLR(True, env->savedLR));
+ addInstr(env, PPCInstr_Goto(jk, cond, ri));
+}
+
+
+/*---------------------------------------------------------*/
+/*--- Insn selector top-level ---*/
+/*---------------------------------------------------------*/
+
+/* Translate an entire SB to ppc code. */
+
+HInstrArray* iselSB_PPC ( IRSB* bb, VexArch arch_host,
+ VexArchInfo* archinfo_host,
+ VexAbiInfo* vbi )
+{
+ Int i, j;
+ HReg hreg, hregHI;
+ ISelEnv* env;
+ UInt hwcaps_host = archinfo_host->hwcaps;
+ Bool mode64 = False;
+ UInt mask32, mask64;
+
+ vassert(arch_host == VexArchPPC32 || arch_host == VexArchPPC64);
+ mode64 = arch_host == VexArchPPC64;
+
+ /* do some sanity checks */
+ mask32 = VEX_HWCAPS_PPC32_F | VEX_HWCAPS_PPC32_V
+ | VEX_HWCAPS_PPC32_FX | VEX_HWCAPS_PPC32_GX;
+
+ mask64 = VEX_HWCAPS_PPC64_V
+ | VEX_HWCAPS_PPC64_FX | VEX_HWCAPS_PPC64_GX;
+
+ if (mode64) {
+ vassert((hwcaps_host & mask32) == 0);
+ } else {
+ vassert((hwcaps_host & mask64) == 0);
+ }
+
+ /* Make up an initial environment to use. */
+ env = LibVEX_Alloc(sizeof(ISelEnv));
+ env->vreg_ctr = 0;
+
+ /* Are we being ppc32 or ppc64? */
+ env->mode64 = mode64;
+
+ /* Set up output code array. */
+ env->code = newHInstrArray();
+
+ /* Copy BB's type env. */
+ env->type_env = bb->tyenv;
+
+ /* Make up an IRTemp -> virtual HReg mapping. This doesn't
+ change as we go along. */
+ env->n_vregmap = bb->tyenv->types_used;
+ env->vregmap = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
+ env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
+
+ /* and finally ... */
+ env->hwcaps = hwcaps_host;
+ env->previous_rm = NULL;
+ env->vbi = vbi;
+
+ /* For each IR temporary, allocate a suitably-kinded virtual
+ register. */
+ j = 0;
+ for (i = 0; i < env->n_vregmap; i++) {
+ hregHI = hreg = INVALID_HREG;
+ switch (bb->tyenv->types[i]) {
+ case Ity_I1:
+ case Ity_I8:
+ case Ity_I16:
+ case Ity_I32:
+ if (mode64) { hreg = mkHReg(j++, HRcInt64, True); break;
+ } else { hreg = mkHReg(j++, HRcInt32, True); break;
+ }
+ case Ity_I64:
+ if (mode64) { hreg = mkHReg(j++, HRcInt64, True); break;
+ } else { hreg = mkHReg(j++, HRcInt32, True);
+ hregHI = mkHReg(j++, HRcInt32, True); break;
+ }
+ case Ity_I128: vassert(mode64);
+ hreg = mkHReg(j++, HRcInt64, True);
+ hregHI = mkHReg(j++, HRcInt64, True); break;
+ case Ity_F32:
+ case Ity_F64: hreg = mkHReg(j++, HRcFlt64, True); break;
+ case Ity_V128: hreg = mkHReg(j++, HRcVec128, True); break;
+ default:
+ ppIRType(bb->tyenv->types[i]);
+ vpanic("iselBB(ppc): IRTemp type");
+ }
+ env->vregmap[i] = hreg;
+ env->vregmapHI[i] = hregHI;
+ }
+ env->vreg_ctr = j;
+
+ /* Keep a copy of the link reg, so helper functions don't kill it. */
+ env->savedLR = newVRegI(env);
+ addInstr(env, PPCInstr_RdWrLR(False, env->savedLR));
+
+ /* Ok, finally we can iterate over the statements. */
+ for (i = 0; i < bb->stmts_used; i++)
+ if (bb->stmts[i])
+ iselStmt(env,bb->stmts[i]);
+
+ iselNext(env,bb->next,bb->jumpkind);
+
+ /* record the number of vregs we used. */
+ env->code->n_vregs = env->vreg_ctr;
+ return env->code;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_ppc_isel.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_x86_defs.c b/VEX/priv/host_x86_defs.c
new file mode 100644
index 0000000..9a6d651
--- /dev/null
+++ b/VEX/priv/host_x86_defs.c
@@ -0,0 +1,3098 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_x86_defs.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex.h"
+#include "libvex_trc_values.h"
+
+#include "main_util.h"
+#include "host_generic_regs.h"
+#include "host_x86_defs.h"
+
+
+/* --------- Registers. --------- */
+
+void ppHRegX86 ( HReg reg )
+{
+ Int r;
+ static HChar* ireg32_names[8]
+ = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi" };
+ /* Be generic for all virtual regs. */
+ if (hregIsVirtual(reg)) {
+ ppHReg(reg);
+ return;
+ }
+ /* But specific for real regs. */
+ switch (hregClass(reg)) {
+ case HRcInt32:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 8);
+ vex_printf("%s", ireg32_names[r]);
+ return;
+ case HRcFlt64:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 6);
+ vex_printf("%%fake%d", r);
+ return;
+ case HRcVec128:
+ r = hregNumber(reg);
+ vassert(r >= 0 && r < 8);
+ vex_printf("%%xmm%d", r);
+ return;
+ default:
+ vpanic("ppHRegX86");
+ }
+}
+
+HReg hregX86_EAX ( void ) { return mkHReg(0, HRcInt32, False); }
+HReg hregX86_ECX ( void ) { return mkHReg(1, HRcInt32, False); }
+HReg hregX86_EDX ( void ) { return mkHReg(2, HRcInt32, False); }
+HReg hregX86_EBX ( void ) { return mkHReg(3, HRcInt32, False); }
+HReg hregX86_ESP ( void ) { return mkHReg(4, HRcInt32, False); }
+HReg hregX86_EBP ( void ) { return mkHReg(5, HRcInt32, False); }
+HReg hregX86_ESI ( void ) { return mkHReg(6, HRcInt32, False); }
+HReg hregX86_EDI ( void ) { return mkHReg(7, HRcInt32, False); }
+
+HReg hregX86_FAKE0 ( void ) { return mkHReg(0, HRcFlt64, False); }
+HReg hregX86_FAKE1 ( void ) { return mkHReg(1, HRcFlt64, False); }
+HReg hregX86_FAKE2 ( void ) { return mkHReg(2, HRcFlt64, False); }
+HReg hregX86_FAKE3 ( void ) { return mkHReg(3, HRcFlt64, False); }
+HReg hregX86_FAKE4 ( void ) { return mkHReg(4, HRcFlt64, False); }
+HReg hregX86_FAKE5 ( void ) { return mkHReg(5, HRcFlt64, False); }
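+
+/* Note (editorial): the six 'fake' FP registers are a flat register
+   bank standing in for the x87 stack; re-expressing them in terms of
+   the real %st(0..7) stack is deferred to instruction emission
+   time. */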
+
+HReg hregX86_XMM0 ( void ) { return mkHReg(0, HRcVec128, False); }
+HReg hregX86_XMM1 ( void ) { return mkHReg(1, HRcVec128, False); }
+HReg hregX86_XMM2 ( void ) { return mkHReg(2, HRcVec128, False); }
+HReg hregX86_XMM3 ( void ) { return mkHReg(3, HRcVec128, False); }
+HReg hregX86_XMM4 ( void ) { return mkHReg(4, HRcVec128, False); }
+HReg hregX86_XMM5 ( void ) { return mkHReg(5, HRcVec128, False); }
+HReg hregX86_XMM6 ( void ) { return mkHReg(6, HRcVec128, False); }
+HReg hregX86_XMM7 ( void ) { return mkHReg(7, HRcVec128, False); }
+
+
+void getAllocableRegs_X86 ( Int* nregs, HReg** arr )
+{
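+   /* %esp and %ebp are deliberately omitted: they are permanently
+      claimed as the host stack pointer and the guest state pointer
+      respectively, so the allocator must not use them. */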
+ *nregs = 20;
+ *arr = LibVEX_Alloc(*nregs * sizeof(HReg));
+ (*arr)[0] = hregX86_EAX();
+ (*arr)[1] = hregX86_EBX();
+ (*arr)[2] = hregX86_ECX();
+ (*arr)[3] = hregX86_EDX();
+ (*arr)[4] = hregX86_ESI();
+ (*arr)[5] = hregX86_EDI();
+ (*arr)[6] = hregX86_FAKE0();
+ (*arr)[7] = hregX86_FAKE1();
+ (*arr)[8] = hregX86_FAKE2();
+ (*arr)[9] = hregX86_FAKE3();
+ (*arr)[10] = hregX86_FAKE4();
+ (*arr)[11] = hregX86_FAKE5();
+ (*arr)[12] = hregX86_XMM0();
+ (*arr)[13] = hregX86_XMM1();
+ (*arr)[14] = hregX86_XMM2();
+ (*arr)[15] = hregX86_XMM3();
+ (*arr)[16] = hregX86_XMM4();
+ (*arr)[17] = hregX86_XMM5();
+ (*arr)[18] = hregX86_XMM6();
+ (*arr)[19] = hregX86_XMM7();
+}
+
+
+/* --------- Condition codes, Intel encoding. --------- */
+
+HChar* showX86CondCode ( X86CondCode cond )
+{
+ switch (cond) {
+ case Xcc_O: return "o";
+ case Xcc_NO: return "no";
+ case Xcc_B: return "b";
+ case Xcc_NB: return "nb";
+ case Xcc_Z: return "z";
+ case Xcc_NZ: return "nz";
+ case Xcc_BE: return "be";
+ case Xcc_NBE: return "nbe";
+ case Xcc_S: return "s";
+ case Xcc_NS: return "ns";
+ case Xcc_P: return "p";
+ case Xcc_NP: return "np";
+ case Xcc_L: return "l";
+ case Xcc_NL: return "nl";
+ case Xcc_LE: return "le";
+ case Xcc_NLE: return "nle";
+ case Xcc_ALWAYS: return "ALWAYS";
+ default: vpanic("ppX86CondCode");
+ }
+}
+
+
+/* --------- X86AMode: memory address expressions. --------- */
+
+X86AMode* X86AMode_IR ( UInt imm32, HReg reg ) {
+ X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
+ am->tag = Xam_IR;
+ am->Xam.IR.imm = imm32;
+ am->Xam.IR.reg = reg;
+ return am;
+}
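+/* imm32(base, index, 1<<shift): 'shift' is the log2 of the index
+   scale factor and must be in the range 0 .. 3. */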
+X86AMode* X86AMode_IRRS ( UInt imm32, HReg base, HReg indEx, Int shift ) {
+ X86AMode* am = LibVEX_Alloc(sizeof(X86AMode));
+ am->tag = Xam_IRRS;
+ am->Xam.IRRS.imm = imm32;
+ am->Xam.IRRS.base = base;
+ am->Xam.IRRS.index = indEx;
+ am->Xam.IRRS.shift = shift;
+ vassert(shift >= 0 && shift <= 3);
+ return am;
+}
+
+X86AMode* dopyX86AMode ( X86AMode* am ) {
+ switch (am->tag) {
+ case Xam_IR:
+ return X86AMode_IR( am->Xam.IR.imm, am->Xam.IR.reg );
+ case Xam_IRRS:
+ return X86AMode_IRRS( am->Xam.IRRS.imm, am->Xam.IRRS.base,
+ am->Xam.IRRS.index, am->Xam.IRRS.shift );
+ default:
+ vpanic("dopyX86AMode");
+ }
+}
+
+void ppX86AMode ( X86AMode* am ) {
+ switch (am->tag) {
+ case Xam_IR:
+ if (am->Xam.IR.imm == 0)
+ vex_printf("(");
+ else
+ vex_printf("0x%x(", am->Xam.IR.imm);
+ ppHRegX86(am->Xam.IR.reg);
+ vex_printf(")");
+ return;
+ case Xam_IRRS:
+ vex_printf("0x%x(", am->Xam.IRRS.imm);
+ ppHRegX86(am->Xam.IRRS.base);
+ vex_printf(",");
+ ppHRegX86(am->Xam.IRRS.index);
+ vex_printf(",%d)", 1 << am->Xam.IRRS.shift);
+ return;
+ default:
+ vpanic("ppX86AMode");
+ }
+}
+
+static void addRegUsage_X86AMode ( HRegUsage* u, X86AMode* am ) {
+ switch (am->tag) {
+ case Xam_IR:
+ addHRegUse(u, HRmRead, am->Xam.IR.reg);
+ return;
+ case Xam_IRRS:
+ addHRegUse(u, HRmRead, am->Xam.IRRS.base);
+ addHRegUse(u, HRmRead, am->Xam.IRRS.index);
+ return;
+ default:
+ vpanic("addRegUsage_X86AMode");
+ }
+}
+
+static void mapRegs_X86AMode ( HRegRemap* m, X86AMode* am ) {
+ switch (am->tag) {
+ case Xam_IR:
+ am->Xam.IR.reg = lookupHRegRemap(m, am->Xam.IR.reg);
+ return;
+ case Xam_IRRS:
+ am->Xam.IRRS.base = lookupHRegRemap(m, am->Xam.IRRS.base);
+ am->Xam.IRRS.index = lookupHRegRemap(m, am->Xam.IRRS.index);
+ return;
+ default:
+ vpanic("mapRegs_X86AMode");
+ }
+}
+
+/* --------- Operand, which can be reg, immediate or memory. --------- */
+
+X86RMI* X86RMI_Imm ( UInt imm32 ) {
+ X86RMI* op = LibVEX_Alloc(sizeof(X86RMI));
+ op->tag = Xrmi_Imm;
+ op->Xrmi.Imm.imm32 = imm32;
+ return op;
+}
+X86RMI* X86RMI_Reg ( HReg reg ) {
+ X86RMI* op = LibVEX_Alloc(sizeof(X86RMI));
+ op->tag = Xrmi_Reg;
+ op->Xrmi.Reg.reg = reg;
+ return op;
+}
+X86RMI* X86RMI_Mem ( X86AMode* am ) {
+ X86RMI* op = LibVEX_Alloc(sizeof(X86RMI));
+ op->tag = Xrmi_Mem;
+ op->Xrmi.Mem.am = am;
+ return op;
+}
+
+void ppX86RMI ( X86RMI* op ) {
+ switch (op->tag) {
+ case Xrmi_Imm:
+ vex_printf("$0x%x", op->Xrmi.Imm.imm32);
+ return;
+ case Xrmi_Reg:
+ ppHRegX86(op->Xrmi.Reg.reg);
+ return;
+ case Xrmi_Mem:
+ ppX86AMode(op->Xrmi.Mem.am);
+ return;
+ default:
+ vpanic("ppX86RMI");
+ }
+}
+
+/* An X86RMI can only be used in a "read" context (what would it mean
+ to write or modify a literal?) and so we enumerate its registers
+ accordingly. */
+static void addRegUsage_X86RMI ( HRegUsage* u, X86RMI* op ) {
+ switch (op->tag) {
+ case Xrmi_Imm:
+ return;
+ case Xrmi_Reg:
+ addHRegUse(u, HRmRead, op->Xrmi.Reg.reg);
+ return;
+ case Xrmi_Mem:
+ addRegUsage_X86AMode(u, op->Xrmi.Mem.am);
+ return;
+ default:
+ vpanic("addRegUsage_X86RMI");
+ }
+}
+
+static void mapRegs_X86RMI ( HRegRemap* m, X86RMI* op ) {
+ switch (op->tag) {
+ case Xrmi_Imm:
+ return;
+ case Xrmi_Reg:
+ op->Xrmi.Reg.reg = lookupHRegRemap(m, op->Xrmi.Reg.reg);
+ return;
+ case Xrmi_Mem:
+ mapRegs_X86AMode(m, op->Xrmi.Mem.am);
+ return;
+ default:
+ vpanic("mapRegs_X86RMI");
+ }
+}
+
+
+/* --------- Operand, which can be reg or immediate only. --------- */
+
+X86RI* X86RI_Imm ( UInt imm32 ) {
+ X86RI* op = LibVEX_Alloc(sizeof(X86RI));
+ op->tag = Xri_Imm;
+ op->Xri.Imm.imm32 = imm32;
+ return op;
+}
+X86RI* X86RI_Reg ( HReg reg ) {
+ X86RI* op = LibVEX_Alloc(sizeof(X86RI));
+ op->tag = Xri_Reg;
+ op->Xri.Reg.reg = reg;
+ return op;
+}
+
+void ppX86RI ( X86RI* op ) {
+ switch (op->tag) {
+ case Xri_Imm:
+ vex_printf("$0x%x", op->Xri.Imm.imm32);
+ return;
+ case Xri_Reg:
+ ppHRegX86(op->Xri.Reg.reg);
+ return;
+ default:
+ vpanic("ppX86RI");
+ }
+}
+
+/* An X86RI can only be used in a "read" context (what would it mean
+ to write or modify a literal?) and so we enumerate its registers
+ accordingly. */
+static void addRegUsage_X86RI ( HRegUsage* u, X86RI* op ) {
+ switch (op->tag) {
+ case Xri_Imm:
+ return;
+ case Xri_Reg:
+ addHRegUse(u, HRmRead, op->Xri.Reg.reg);
+ return;
+ default:
+ vpanic("addRegUsage_X86RI");
+ }
+}
+
+static void mapRegs_X86RI ( HRegRemap* m, X86RI* op ) {
+ switch (op->tag) {
+ case Xri_Imm:
+ return;
+ case Xri_Reg:
+ op->Xri.Reg.reg = lookupHRegRemap(m, op->Xri.Reg.reg);
+ return;
+ default:
+ vpanic("mapRegs_X86RI");
+ }
+}
+
+
+/* --------- Operand, which can be reg or memory only. --------- */
+
+X86RM* X86RM_Reg ( HReg reg ) {
+ X86RM* op = LibVEX_Alloc(sizeof(X86RM));
+ op->tag = Xrm_Reg;
+ op->Xrm.Reg.reg = reg;
+ return op;
+}
+X86RM* X86RM_Mem ( X86AMode* am ) {
+ X86RM* op = LibVEX_Alloc(sizeof(X86RM));
+ op->tag = Xrm_Mem;
+ op->Xrm.Mem.am = am;
+ return op;
+}
+
+void ppX86RM ( X86RM* op ) {
+ switch (op->tag) {
+ case Xrm_Mem:
+ ppX86AMode(op->Xrm.Mem.am);
+ return;
+ case Xrm_Reg:
+ ppHRegX86(op->Xrm.Reg.reg);
+ return;
+ default:
+ vpanic("ppX86RM");
+ }
+}
+
+/* Because an X86RM can be used as either a source or a destination
+ operand, we have to supply a mode -- pertaining to the operand as a
+ whole -- indicating how it's being used. */
+static void addRegUsage_X86RM ( HRegUsage* u, X86RM* op, HRegMode mode ) {
+ switch (op->tag) {
+ case Xrm_Mem:
+ /* Memory is read, written or modified. So we just want to
+ know the regs read by the amode. */
+ addRegUsage_X86AMode(u, op->Xrm.Mem.am);
+ return;
+ case Xrm_Reg:
+ /* reg is read, written or modified. Add it in the
+ appropriate way. */
+ addHRegUse(u, mode, op->Xrm.Reg.reg);
+ return;
+ default:
+ vpanic("addRegUsage_X86RM");
+ }
+}
+
+static void mapRegs_X86RM ( HRegRemap* m, X86RM* op )
+{
+ switch (op->tag) {
+ case Xrm_Mem:
+ mapRegs_X86AMode(m, op->Xrm.Mem.am);
+ return;
+ case Xrm_Reg:
+ op->Xrm.Reg.reg = lookupHRegRemap(m, op->Xrm.Reg.reg);
+ return;
+ default:
+ vpanic("mapRegs_X86RM");
+ }
+}
+
+
+/* --------- Instructions. --------- */
+
+HChar* showX86UnaryOp ( X86UnaryOp op ) {
+ switch (op) {
+ case Xun_NOT: return "not";
+ case Xun_NEG: return "neg";
+ default: vpanic("showX86UnaryOp");
+ }
+}
+
+HChar* showX86AluOp ( X86AluOp op ) {
+ switch (op) {
+ case Xalu_MOV: return "mov";
+ case Xalu_CMP: return "cmp";
+ case Xalu_ADD: return "add";
+ case Xalu_SUB: return "sub";
+ case Xalu_ADC: return "adc";
+ case Xalu_SBB: return "sbb";
+ case Xalu_AND: return "and";
+ case Xalu_OR: return "or";
+ case Xalu_XOR: return "xor";
+ case Xalu_MUL: return "mul";
+ default: vpanic("showX86AluOp");
+ }
+}
+
+HChar* showX86ShiftOp ( X86ShiftOp op ) {
+ switch (op) {
+ case Xsh_SHL: return "shl";
+ case Xsh_SHR: return "shr";
+ case Xsh_SAR: return "sar";
+ default: vpanic("showX86ShiftOp");
+ }
+}
+
+HChar* showX86FpOp ( X86FpOp op ) {
+ switch (op) {
+ case Xfp_ADD: return "add";
+ case Xfp_SUB: return "sub";
+ case Xfp_MUL: return "mul";
+ case Xfp_DIV: return "div";
+ case Xfp_SCALE: return "scale";
+ case Xfp_ATAN: return "atan";
+ case Xfp_YL2X: return "yl2x";
+ case Xfp_YL2XP1: return "yl2xp1";
+ case Xfp_PREM: return "prem";
+ case Xfp_PREM1: return "prem1";
+ case Xfp_SQRT: return "sqrt";
+ case Xfp_ABS: return "abs";
+ case Xfp_NEG: return "chs";
+ case Xfp_MOV: return "mov";
+ case Xfp_SIN: return "sin";
+ case Xfp_COS: return "cos";
+ case Xfp_TAN: return "tan";
+ case Xfp_ROUND: return "round";
+ case Xfp_2XM1: return "2xm1";
+ default: vpanic("showX86FpOp");
+ }
+}
+
+HChar* showX86SseOp ( X86SseOp op ) {
+ switch (op) {
+ case Xsse_MOV: return "mov(?!)";
+ case Xsse_ADDF: return "add";
+ case Xsse_SUBF: return "sub";
+ case Xsse_MULF: return "mul";
+ case Xsse_DIVF: return "div";
+ case Xsse_MAXF: return "max";
+ case Xsse_MINF: return "min";
+ case Xsse_CMPEQF: return "cmpFeq";
+ case Xsse_CMPLTF: return "cmpFlt";
+ case Xsse_CMPLEF: return "cmpFle";
+ case Xsse_CMPUNF: return "cmpFun";
+ case Xsse_RCPF: return "rcp";
+ case Xsse_RSQRTF: return "rsqrt";
+ case Xsse_SQRTF: return "sqrt";
+ case Xsse_AND: return "and";
+ case Xsse_OR: return "or";
+ case Xsse_XOR: return "xor";
+ case Xsse_ANDN: return "andn";
+ case Xsse_ADD8: return "paddb";
+ case Xsse_ADD16: return "paddw";
+ case Xsse_ADD32: return "paddd";
+ case Xsse_ADD64: return "paddq";
+ case Xsse_QADD8U: return "paddusb";
+ case Xsse_QADD16U: return "paddusw";
+ case Xsse_QADD8S: return "paddsb";
+ case Xsse_QADD16S: return "paddsw";
+ case Xsse_SUB8: return "psubb";
+ case Xsse_SUB16: return "psubw";
+ case Xsse_SUB32: return "psubd";
+ case Xsse_SUB64: return "psubq";
+ case Xsse_QSUB8U: return "psubusb";
+ case Xsse_QSUB16U: return "psubusw";
+ case Xsse_QSUB8S: return "psubsb";
+ case Xsse_QSUB16S: return "psubsw";
+ case Xsse_MUL16: return "pmullw";
+ case Xsse_MULHI16U: return "pmulhuw";
+ case Xsse_MULHI16S: return "pmulhw";
+ case Xsse_AVG8U: return "pavgb";
+ case Xsse_AVG16U: return "pavgw";
+ case Xsse_MAX16S: return "pmaxw";
+ case Xsse_MAX8U: return "pmaxub";
+ case Xsse_MIN16S: return "pminw";
+ case Xsse_MIN8U: return "pminub";
+ case Xsse_CMPEQ8: return "pcmpeqb";
+ case Xsse_CMPEQ16: return "pcmpeqw";
+ case Xsse_CMPEQ32: return "pcmpeqd";
+ case Xsse_CMPGT8S: return "pcmpgtb";
+ case Xsse_CMPGT16S: return "pcmpgtw";
+ case Xsse_CMPGT32S: return "pcmpgtd";
+ case Xsse_SHL16: return "psllw";
+ case Xsse_SHL32: return "pslld";
+ case Xsse_SHL64: return "psllq";
+ case Xsse_SHR16: return "psrlw";
+ case Xsse_SHR32: return "psrld";
+ case Xsse_SHR64: return "psrlq";
+ case Xsse_SAR16: return "psraw";
+ case Xsse_SAR32: return "psrad";
+ case Xsse_PACKSSD: return "packssdw";
+ case Xsse_PACKSSW: return "packsswb";
+ case Xsse_PACKUSW: return "packuswb";
+ case Xsse_UNPCKHB: return "punpckhb";
+ case Xsse_UNPCKHW: return "punpckhw";
+ case Xsse_UNPCKHD: return "punpckhd";
+ case Xsse_UNPCKHQ: return "punpckhq";
+ case Xsse_UNPCKLB: return "punpcklb";
+ case Xsse_UNPCKLW: return "punpcklw";
+ case Xsse_UNPCKLD: return "punpckld";
+ case Xsse_UNPCKLQ: return "punpcklq";
+ default: vpanic("showX86SseOp");
+ }
+}
+
+X86Instr* X86Instr_Alu32R ( X86AluOp op, X86RMI* src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Alu32R;
+ i->Xin.Alu32R.op = op;
+ i->Xin.Alu32R.src = src;
+ i->Xin.Alu32R.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_Alu32M ( X86AluOp op, X86RI* src, X86AMode* dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Alu32M;
+ i->Xin.Alu32M.op = op;
+ i->Xin.Alu32M.src = src;
+ i->Xin.Alu32M.dst = dst;
+ vassert(op != Xalu_MUL);
+ return i;
+}
+X86Instr* X86Instr_Sh32 ( X86ShiftOp op, UInt src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Sh32;
+ i->Xin.Sh32.op = op;
+ i->Xin.Sh32.src = src;
+ i->Xin.Sh32.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_Test32 ( UInt imm32, X86RM* dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Test32;
+ i->Xin.Test32.imm32 = imm32;
+ i->Xin.Test32.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_Unary32 ( X86UnaryOp op, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Unary32;
+ i->Xin.Unary32.op = op;
+ i->Xin.Unary32.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_Lea32 ( X86AMode* am, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Lea32;
+ i->Xin.Lea32.am = am;
+ i->Xin.Lea32.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_MulL ( Bool syned, X86RM* src ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_MulL;
+ i->Xin.MulL.syned = syned;
+ i->Xin.MulL.src = src;
+ return i;
+}
+X86Instr* X86Instr_Div ( Bool syned, X86RM* src ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Div;
+ i->Xin.Div.syned = syned;
+ i->Xin.Div.src = src;
+ return i;
+}
+X86Instr* X86Instr_Sh3232 ( X86ShiftOp op, UInt amt, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Sh3232;
+ i->Xin.Sh3232.op = op;
+ i->Xin.Sh3232.amt = amt;
+ i->Xin.Sh3232.src = src;
+ i->Xin.Sh3232.dst = dst;
+ vassert(op == Xsh_SHL || op == Xsh_SHR);
+ return i;
+}
+X86Instr* X86Instr_Push( X86RMI* src ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Push;
+ i->Xin.Push.src = src;
+ return i;
+}
+X86Instr* X86Instr_Call ( X86CondCode cond, Addr32 target, Int regparms ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Call;
+ i->Xin.Call.cond = cond;
+ i->Xin.Call.target = target;
+ i->Xin.Call.regparms = regparms;
+ vassert(regparms >= 0 && regparms <= 3);
+ return i;
+}
+X86Instr* X86Instr_Goto ( IRJumpKind jk, X86CondCode cond, X86RI* dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Goto;
+ i->Xin.Goto.cond = cond;
+ i->Xin.Goto.dst = dst;
+ i->Xin.Goto.jk = jk;
+ return i;
+}
+X86Instr* X86Instr_CMov32 ( X86CondCode cond, X86RM* src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_CMov32;
+ i->Xin.CMov32.cond = cond;
+ i->Xin.CMov32.src = src;
+ i->Xin.CMov32.dst = dst;
+ vassert(cond != Xcc_ALWAYS);
+ return i;
+}
+X86Instr* X86Instr_LoadEX ( UChar szSmall, Bool syned,
+ X86AMode* src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_LoadEX;
+ i->Xin.LoadEX.szSmall = szSmall;
+ i->Xin.LoadEX.syned = syned;
+ i->Xin.LoadEX.src = src;
+ i->Xin.LoadEX.dst = dst;
+ vassert(szSmall == 1 || szSmall == 2);
+ return i;
+}
+X86Instr* X86Instr_Store ( UChar sz, HReg src, X86AMode* dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Store;
+ i->Xin.Store.sz = sz;
+ i->Xin.Store.src = src;
+ i->Xin.Store.dst = dst;
+ vassert(sz == 1 || sz == 2);
+ return i;
+}
+X86Instr* X86Instr_Set32 ( X86CondCode cond, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Set32;
+ i->Xin.Set32.cond = cond;
+ i->Xin.Set32.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_Bsfr32 ( Bool isFwds, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Bsfr32;
+ i->Xin.Bsfr32.isFwds = isFwds;
+ i->Xin.Bsfr32.src = src;
+ i->Xin.Bsfr32.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_MFence ( UInt hwcaps ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_MFence;
+ i->Xin.MFence.hwcaps = hwcaps;
+ vassert(0 == (hwcaps & ~(VEX_HWCAPS_X86_SSE1
+ |VEX_HWCAPS_X86_SSE2
+ |VEX_HWCAPS_X86_SSE3
+ |VEX_HWCAPS_X86_LZCNT)));
+ return i;
+}
+X86Instr* X86Instr_ACAS ( X86AMode* addr, UChar sz ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_ACAS;
+ i->Xin.ACAS.addr = addr;
+ i->Xin.ACAS.sz = sz;
+ vassert(sz == 4 || sz == 2 || sz == 1);
+ return i;
+}
+X86Instr* X86Instr_DACAS ( X86AMode* addr ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_DACAS;
+ i->Xin.DACAS.addr = addr;
+ return i;
+}
+
+X86Instr* X86Instr_FpUnary ( X86FpOp op, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_FpUnary;
+ i->Xin.FpUnary.op = op;
+ i->Xin.FpUnary.src = src;
+ i->Xin.FpUnary.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_FpBinary;
+ i->Xin.FpBinary.op = op;
+ i->Xin.FpBinary.srcL = srcL;
+ i->Xin.FpBinary.srcR = srcR;
+ i->Xin.FpBinary.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* addr ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_FpLdSt;
+ i->Xin.FpLdSt.isLoad = isLoad;
+ i->Xin.FpLdSt.sz = sz;
+ i->Xin.FpLdSt.reg = reg;
+ i->Xin.FpLdSt.addr = addr;
+ vassert(sz == 4 || sz == 8 || sz == 10);
+ return i;
+}
+X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz,
+ HReg reg, X86AMode* addr ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_FpLdStI;
+ i->Xin.FpLdStI.isLoad = isLoad;
+ i->Xin.FpLdStI.sz = sz;
+ i->Xin.FpLdStI.reg = reg;
+ i->Xin.FpLdStI.addr = addr;
+ vassert(sz == 2 || sz == 4 || sz == 8);
+ return i;
+}
+X86Instr* X86Instr_Fp64to32 ( HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Fp64to32;
+ i->Xin.Fp64to32.src = src;
+ i->Xin.Fp64to32.dst = dst;
+ return i;
+}
+X86Instr* X86Instr_FpCMov ( X86CondCode cond, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_FpCMov;
+ i->Xin.FpCMov.cond = cond;
+ i->Xin.FpCMov.src = src;
+ i->Xin.FpCMov.dst = dst;
+ vassert(cond != Xcc_ALWAYS);
+ return i;
+}
+X86Instr* X86Instr_FpLdCW ( X86AMode* addr ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_FpLdCW;
+ i->Xin.FpLdCW.addr = addr;
+ return i;
+}
+X86Instr* X86Instr_FpStSW_AX ( void ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_FpStSW_AX;
+ return i;
+}
+X86Instr* X86Instr_FpCmp ( HReg srcL, HReg srcR, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_FpCmp;
+ i->Xin.FpCmp.srcL = srcL;
+ i->Xin.FpCmp.srcR = srcR;
+ i->Xin.FpCmp.dst = dst;
+ return i;
+}
+
+X86Instr* X86Instr_SseConst ( UShort con, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_SseConst;
+ i->Xin.SseConst.con = con;
+ i->Xin.SseConst.dst = dst;
+ vassert(hregClass(dst) == HRcVec128);
+ return i;
+}
+X86Instr* X86Instr_SseLdSt ( Bool isLoad, HReg reg, X86AMode* addr ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_SseLdSt;
+ i->Xin.SseLdSt.isLoad = isLoad;
+ i->Xin.SseLdSt.reg = reg;
+ i->Xin.SseLdSt.addr = addr;
+ return i;
+}
+X86Instr* X86Instr_SseLdzLO ( Int sz, HReg reg, X86AMode* addr )
+{
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_SseLdzLO;
+ i->Xin.SseLdzLO.sz = toUChar(sz);
+ i->Xin.SseLdzLO.reg = reg;
+ i->Xin.SseLdzLO.addr = addr;
+ vassert(sz == 4 || sz == 8);
+ return i;
+}
+X86Instr* X86Instr_Sse32Fx4 ( X86SseOp op, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Sse32Fx4;
+ i->Xin.Sse32Fx4.op = op;
+ i->Xin.Sse32Fx4.src = src;
+ i->Xin.Sse32Fx4.dst = dst;
+ vassert(op != Xsse_MOV);
+ return i;
+}
+X86Instr* X86Instr_Sse32FLo ( X86SseOp op, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Sse32FLo;
+ i->Xin.Sse32FLo.op = op;
+ i->Xin.Sse32FLo.src = src;
+ i->Xin.Sse32FLo.dst = dst;
+ vassert(op != Xsse_MOV);
+ return i;
+}
+X86Instr* X86Instr_Sse64Fx2 ( X86SseOp op, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Sse64Fx2;
+ i->Xin.Sse64Fx2.op = op;
+ i->Xin.Sse64Fx2.src = src;
+ i->Xin.Sse64Fx2.dst = dst;
+ vassert(op != Xsse_MOV);
+ return i;
+}
+X86Instr* X86Instr_Sse64FLo ( X86SseOp op, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_Sse64FLo;
+ i->Xin.Sse64FLo.op = op;
+ i->Xin.Sse64FLo.src = src;
+ i->Xin.Sse64FLo.dst = dst;
+ vassert(op != Xsse_MOV);
+ return i;
+}
+X86Instr* X86Instr_SseReRg ( X86SseOp op, HReg re, HReg rg ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_SseReRg;
+ i->Xin.SseReRg.op = op;
+ i->Xin.SseReRg.src = re;
+ i->Xin.SseReRg.dst = rg;
+ return i;
+}
+X86Instr* X86Instr_SseCMov ( X86CondCode cond, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_SseCMov;
+ i->Xin.SseCMov.cond = cond;
+ i->Xin.SseCMov.src = src;
+ i->Xin.SseCMov.dst = dst;
+ vassert(cond != Xcc_ALWAYS);
+ return i;
+}
+X86Instr* X86Instr_SseShuf ( Int order, HReg src, HReg dst ) {
+ X86Instr* i = LibVEX_Alloc(sizeof(X86Instr));
+ i->tag = Xin_SseShuf;
+ i->Xin.SseShuf.order = order;
+ i->Xin.SseShuf.src = src;
+ i->Xin.SseShuf.dst = dst;
+ vassert(order >= 0 && order <= 0xFF);
+ return i;
+}
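+
+/* Illustrative use of the above constructors: an insn  addl $1,%vr
+   could be built as  X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(1), vr)
+   for some (virtual) HReg vr. */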
+
+void ppX86Instr ( X86Instr* i, Bool mode64 ) {
+ vassert(mode64 == False);
+ switch (i->tag) {
+ case Xin_Alu32R:
+ vex_printf("%sl ", showX86AluOp(i->Xin.Alu32R.op));
+ ppX86RMI(i->Xin.Alu32R.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.Alu32R.dst);
+ return;
+ case Xin_Alu32M:
+ vex_printf("%sl ", showX86AluOp(i->Xin.Alu32M.op));
+ ppX86RI(i->Xin.Alu32M.src);
+ vex_printf(",");
+ ppX86AMode(i->Xin.Alu32M.dst);
+ return;
+ case Xin_Sh32:
+ vex_printf("%sl ", showX86ShiftOp(i->Xin.Sh32.op));
+ if (i->Xin.Sh32.src == 0)
+ vex_printf("%%cl,");
+ else
+ vex_printf("$%d,", (Int)i->Xin.Sh32.src);
+ ppHRegX86(i->Xin.Sh32.dst);
+ return;
+ case Xin_Test32:
+ vex_printf("testl $%d,", (Int)i->Xin.Test32.imm32);
+ ppX86RM(i->Xin.Test32.dst);
+ return;
+ case Xin_Unary32:
+ vex_printf("%sl ", showX86UnaryOp(i->Xin.Unary32.op));
+ ppHRegX86(i->Xin.Unary32.dst);
+ return;
+ case Xin_Lea32:
+ vex_printf("leal ");
+ ppX86AMode(i->Xin.Lea32.am);
+ vex_printf(",");
+ ppHRegX86(i->Xin.Lea32.dst);
+ return;
+ case Xin_MulL:
+ vex_printf("%cmull ", i->Xin.MulL.syned ? 's' : 'u');
+ ppX86RM(i->Xin.MulL.src);
+ return;
+ case Xin_Div:
+ vex_printf("%cdivl ", i->Xin.Div.syned ? 's' : 'u');
+ ppX86RM(i->Xin.Div.src);
+ return;
+ case Xin_Sh3232:
+ vex_printf("%sdl ", showX86ShiftOp(i->Xin.Sh3232.op));
+ if (i->Xin.Sh3232.amt == 0)
+ vex_printf("%%cl,");
+ else
+ vex_printf("$%d,", (Int)i->Xin.Sh3232.amt);
+ ppHRegX86(i->Xin.Sh3232.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.Sh3232.dst);
+ return;
+ case Xin_Push:
+ vex_printf("pushl ");
+ ppX86RMI(i->Xin.Push.src);
+ return;
+ case Xin_Call:
+ vex_printf("call%s[%d] ",
+ i->Xin.Call.cond==Xcc_ALWAYS
+ ? "" : showX86CondCode(i->Xin.Call.cond),
+ i->Xin.Call.regparms);
+ vex_printf("0x%x", i->Xin.Call.target);
+ break;
+ case Xin_Goto:
+ if (i->Xin.Goto.cond != Xcc_ALWAYS) {
+ vex_printf("if (%%eflags.%s) { ",
+ showX86CondCode(i->Xin.Goto.cond));
+ }
+ if (i->Xin.Goto.jk != Ijk_Boring
+ && i->Xin.Goto.jk != Ijk_Call
+ && i->Xin.Goto.jk != Ijk_Ret) {
+ vex_printf("movl $");
+ ppIRJumpKind(i->Xin.Goto.jk);
+ vex_printf(",%%ebp ; ");
+ }
+ vex_printf("movl ");
+ ppX86RI(i->Xin.Goto.dst);
+ vex_printf(",%%eax ; movl $dispatcher_addr,%%edx ; jmp *%%edx");
+ if (i->Xin.Goto.cond != Xcc_ALWAYS) {
+ vex_printf(" }");
+ }
+ return;
+ case Xin_CMov32:
+ vex_printf("cmov%s ", showX86CondCode(i->Xin.CMov32.cond));
+ ppX86RM(i->Xin.CMov32.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.CMov32.dst);
+ return;
+ case Xin_LoadEX:
+ vex_printf("mov%c%cl ",
+ i->Xin.LoadEX.syned ? 's' : 'z',
+ i->Xin.LoadEX.szSmall==1 ? 'b' : 'w');
+ ppX86AMode(i->Xin.LoadEX.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.LoadEX.dst);
+ return;
+ case Xin_Store:
+ vex_printf("mov%c ", i->Xin.Store.sz==1 ? 'b' : 'w');
+ ppHRegX86(i->Xin.Store.src);
+ vex_printf(",");
+ ppX86AMode(i->Xin.Store.dst);
+ return;
+ case Xin_Set32:
+ vex_printf("setl%s ", showX86CondCode(i->Xin.Set32.cond));
+ ppHRegX86(i->Xin.Set32.dst);
+ return;
+ case Xin_Bsfr32:
+ vex_printf("bs%cl ", i->Xin.Bsfr32.isFwds ? 'f' : 'r');
+ ppHRegX86(i->Xin.Bsfr32.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.Bsfr32.dst);
+ return;
+ case Xin_MFence:
+ vex_printf("mfence(%s)",
+ LibVEX_ppVexHwCaps(VexArchX86,i->Xin.MFence.hwcaps));
+ return;
+ case Xin_ACAS:
+ vex_printf("lock cmpxchg%c ",
+ i->Xin.ACAS.sz==1 ? 'b'
+ : i->Xin.ACAS.sz==2 ? 'w' : 'l');
+ vex_printf("{%%eax->%%ebx},");
+ ppX86AMode(i->Xin.ACAS.addr);
+ return;
+ case Xin_DACAS:
+ vex_printf("lock cmpxchg8b {%%edx:%%eax->%%ecx:%%ebx},");
+ ppX86AMode(i->Xin.DACAS.addr);
+ return;
+ case Xin_FpUnary:
+ vex_printf("g%sD ", showX86FpOp(i->Xin.FpUnary.op));
+ ppHRegX86(i->Xin.FpUnary.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.FpUnary.dst);
+ break;
+ case Xin_FpBinary:
+ vex_printf("g%sD ", showX86FpOp(i->Xin.FpBinary.op));
+ ppHRegX86(i->Xin.FpBinary.srcL);
+ vex_printf(",");
+ ppHRegX86(i->Xin.FpBinary.srcR);
+ vex_printf(",");
+ ppHRegX86(i->Xin.FpBinary.dst);
+ break;
+ case Xin_FpLdSt:
+ if (i->Xin.FpLdSt.isLoad) {
+ vex_printf("gld%c " , i->Xin.FpLdSt.sz==10 ? 'T'
+ : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
+ ppX86AMode(i->Xin.FpLdSt.addr);
+ vex_printf(", ");
+ ppHRegX86(i->Xin.FpLdSt.reg);
+ } else {
+ vex_printf("gst%c " , i->Xin.FpLdSt.sz==10 ? 'T'
+ : (i->Xin.FpLdSt.sz==8 ? 'D' : 'F'));
+ ppHRegX86(i->Xin.FpLdSt.reg);
+ vex_printf(", ");
+ ppX86AMode(i->Xin.FpLdSt.addr);
+ }
+ return;
+ case Xin_FpLdStI:
+ if (i->Xin.FpLdStI.isLoad) {
+ vex_printf("gild%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
+ i->Xin.FpLdStI.sz==4 ? "l" : "w");
+ ppX86AMode(i->Xin.FpLdStI.addr);
+ vex_printf(", ");
+ ppHRegX86(i->Xin.FpLdStI.reg);
+ } else {
+ vex_printf("gist%s ", i->Xin.FpLdStI.sz==8 ? "ll" :
+ i->Xin.FpLdStI.sz==4 ? "l" : "w");
+ ppHRegX86(i->Xin.FpLdStI.reg);
+ vex_printf(", ");
+ ppX86AMode(i->Xin.FpLdStI.addr);
+ }
+ return;
+ case Xin_Fp64to32:
+ vex_printf("gdtof ");
+ ppHRegX86(i->Xin.Fp64to32.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.Fp64to32.dst);
+ return;
+ case Xin_FpCMov:
+ vex_printf("gcmov%s ", showX86CondCode(i->Xin.FpCMov.cond));
+ ppHRegX86(i->Xin.FpCMov.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.FpCMov.dst);
+ return;
+ case Xin_FpLdCW:
+ vex_printf("fldcw ");
+ ppX86AMode(i->Xin.FpLdCW.addr);
+ return;
+ case Xin_FpStSW_AX:
+ vex_printf("fstsw %%ax");
+ return;
+ case Xin_FpCmp:
+ vex_printf("gcmp ");
+ ppHRegX86(i->Xin.FpCmp.srcL);
+ vex_printf(",");
+ ppHRegX86(i->Xin.FpCmp.srcR);
+ vex_printf(",");
+ ppHRegX86(i->Xin.FpCmp.dst);
+ break;
+ case Xin_SseConst:
+ vex_printf("const $0x%04x,", (Int)i->Xin.SseConst.con);
+ ppHRegX86(i->Xin.SseConst.dst);
+ break;
+ case Xin_SseLdSt:
+ vex_printf("movups ");
+ if (i->Xin.SseLdSt.isLoad) {
+ ppX86AMode(i->Xin.SseLdSt.addr);
+ vex_printf(",");
+ ppHRegX86(i->Xin.SseLdSt.reg);
+ } else {
+ ppHRegX86(i->Xin.SseLdSt.reg);
+ vex_printf(",");
+ ppX86AMode(i->Xin.SseLdSt.addr);
+ }
+ return;
+ case Xin_SseLdzLO:
+ vex_printf("movs%s ", i->Xin.SseLdzLO.sz==4 ? "s" : "d");
+ ppX86AMode(i->Xin.SseLdzLO.addr);
+ vex_printf(",");
+ ppHRegX86(i->Xin.SseLdzLO.reg);
+ return;
+ case Xin_Sse32Fx4:
+ vex_printf("%sps ", showX86SseOp(i->Xin.Sse32Fx4.op));
+ ppHRegX86(i->Xin.Sse32Fx4.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.Sse32Fx4.dst);
+ return;
+ case Xin_Sse32FLo:
+ vex_printf("%sss ", showX86SseOp(i->Xin.Sse32FLo.op));
+ ppHRegX86(i->Xin.Sse32FLo.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.Sse32FLo.dst);
+ return;
+ case Xin_Sse64Fx2:
+ vex_printf("%spd ", showX86SseOp(i->Xin.Sse64Fx2.op));
+ ppHRegX86(i->Xin.Sse64Fx2.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.Sse64Fx2.dst);
+ return;
+ case Xin_Sse64FLo:
+ vex_printf("%ssd ", showX86SseOp(i->Xin.Sse64FLo.op));
+ ppHRegX86(i->Xin.Sse64FLo.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.Sse64FLo.dst);
+ return;
+ case Xin_SseReRg:
+ vex_printf("%s ", showX86SseOp(i->Xin.SseReRg.op));
+ ppHRegX86(i->Xin.SseReRg.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.SseReRg.dst);
+ return;
+ case Xin_SseCMov:
+ vex_printf("cmov%s ", showX86CondCode(i->Xin.SseCMov.cond));
+ ppHRegX86(i->Xin.SseCMov.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.SseCMov.dst);
+ return;
+ case Xin_SseShuf:
+ vex_printf("pshufd $0x%x,", i->Xin.SseShuf.order);
+ ppHRegX86(i->Xin.SseShuf.src);
+ vex_printf(",");
+ ppHRegX86(i->Xin.SseShuf.dst);
+ return;
+
+ default:
+ vpanic("ppX86Instr");
+ }
+}
+
+/* --------- Helpers for register allocation. --------- */
+
+void getRegUsage_X86Instr (HRegUsage* u, X86Instr* i, Bool mode64)
+{
+ Bool unary;
+ vassert(mode64 == False);
+ initHRegUsage(u);
+ switch (i->tag) {
+ case Xin_Alu32R:
+ addRegUsage_X86RMI(u, i->Xin.Alu32R.src);
+ if (i->Xin.Alu32R.op == Xalu_MOV) {
+ addHRegUse(u, HRmWrite, i->Xin.Alu32R.dst);
+ return;
+ }
+ if (i->Xin.Alu32R.op == Xalu_CMP) {
+ addHRegUse(u, HRmRead, i->Xin.Alu32R.dst);
+ return;
+ }
+ addHRegUse(u, HRmModify, i->Xin.Alu32R.dst);
+ return;
+ case Xin_Alu32M:
+ addRegUsage_X86RI(u, i->Xin.Alu32M.src);
+ addRegUsage_X86AMode(u, i->Xin.Alu32M.dst);
+ return;
+ case Xin_Sh32:
+ addHRegUse(u, HRmModify, i->Xin.Sh32.dst);
+ if (i->Xin.Sh32.src == 0)
+ addHRegUse(u, HRmRead, hregX86_ECX());
+ return;
+ case Xin_Test32:
+ addRegUsage_X86RM(u, i->Xin.Test32.dst, HRmRead);
+ return;
+ case Xin_Unary32:
+ addHRegUse(u, HRmModify, i->Xin.Unary32.dst);
+ return;
+ case Xin_Lea32:
+ addRegUsage_X86AMode(u, i->Xin.Lea32.am);
+ addHRegUse(u, HRmWrite, i->Xin.Lea32.dst);
+ return;
+ case Xin_MulL:
+ addRegUsage_X86RM(u, i->Xin.MulL.src, HRmRead);
+ addHRegUse(u, HRmModify, hregX86_EAX());
+ addHRegUse(u, HRmWrite, hregX86_EDX());
+ return;
+ case Xin_Div:
+ addRegUsage_X86RM(u, i->Xin.Div.src, HRmRead);
+ addHRegUse(u, HRmModify, hregX86_EAX());
+ addHRegUse(u, HRmModify, hregX86_EDX());
+ return;
+ case Xin_Sh3232:
+ addHRegUse(u, HRmRead, i->Xin.Sh3232.src);
+ addHRegUse(u, HRmModify, i->Xin.Sh3232.dst);
+ if (i->Xin.Sh3232.amt == 0)
+ addHRegUse(u, HRmRead, hregX86_ECX());
+ return;
+ case Xin_Push:
+ addRegUsage_X86RMI(u, i->Xin.Push.src);
+ addHRegUse(u, HRmModify, hregX86_ESP());
+ return;
+ case Xin_Call:
+ /* This is a bit subtle. */
+ /* First off, claim it trashes all the caller-saved regs
+ which fall within the register allocator's jurisdiction.
+ These I believe to be %eax %ecx %edx and all the xmm
+ registers. */
+ addHRegUse(u, HRmWrite, hregX86_EAX());
+ addHRegUse(u, HRmWrite, hregX86_ECX());
+ addHRegUse(u, HRmWrite, hregX86_EDX());
+ addHRegUse(u, HRmWrite, hregX86_XMM0());
+ addHRegUse(u, HRmWrite, hregX86_XMM1());
+ addHRegUse(u, HRmWrite, hregX86_XMM2());
+ addHRegUse(u, HRmWrite, hregX86_XMM3());
+ addHRegUse(u, HRmWrite, hregX86_XMM4());
+ addHRegUse(u, HRmWrite, hregX86_XMM5());
+ addHRegUse(u, HRmWrite, hregX86_XMM6());
+ addHRegUse(u, HRmWrite, hregX86_XMM7());
+ /* Now we have to state any parameter-carrying registers
+ which might be read. This depends on the regparmness. */
+ switch (i->Xin.Call.regparms) {
+ case 3: addHRegUse(u, HRmRead, hregX86_ECX()); /*fallthru*/
+ case 2: addHRegUse(u, HRmRead, hregX86_EDX()); /*fallthru*/
+ case 1: addHRegUse(u, HRmRead, hregX86_EAX()); break;
+ case 0: break;
+ default: vpanic("getRegUsage_X86Instr:Call:regparms");
+ }
+ /* Finally, there is the issue that the insn trashes a
+ register because the literal target address has to be
+ loaded into a register. Fortunately, for the 0/1/2
+ regparm case, we can use EAX, EDX and ECX respectively, so
+ this does not cause any further damage. For the 3-regparm
+ case, we'll have to choose another register arbitrarily --
+ since A, D and C are used for parameters -- and so we might
+ as well choose EDI. */
+ if (i->Xin.Call.regparms == 3)
+ addHRegUse(u, HRmWrite, hregX86_EDI());
+ /* Upshot of this is that the assembler really must observe
+ the here-stated convention of which register to use as an
+ address temporary, depending on the regparmness: 0==EAX,
+ 1==EDX, 2==ECX, 3==EDI. */
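+ /* e.g. (illustrative): for a 2-regparm call the args travel in
+ %eax and %edx, so the emitter materialises the target as
+ movl $target,%ecx ; call *%ecx. */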
+ return;
+ case Xin_Goto:
+ addRegUsage_X86RI(u, i->Xin.Goto.dst);
+ addHRegUse(u, HRmWrite, hregX86_EAX()); /* used for next guest addr */
+ addHRegUse(u, HRmWrite, hregX86_EDX()); /* used for dispatcher addr */
+ if (i->Xin.Goto.jk != Ijk_Boring
+ && i->Xin.Goto.jk != Ijk_Call
+ && i->Xin.Goto.jk != Ijk_Ret)
+ /* note, this is irrelevant since ebp is not actually
+ available to the allocator. But still .. */
+ addHRegUse(u, HRmWrite, hregX86_EBP());
+ return;
+ case Xin_CMov32:
+ addRegUsage_X86RM(u, i->Xin.CMov32.src, HRmRead);
+ addHRegUse(u, HRmModify, i->Xin.CMov32.dst);
+ return;
+ case Xin_LoadEX:
+ addRegUsage_X86AMode(u, i->Xin.LoadEX.src);
+ addHRegUse(u, HRmWrite, i->Xin.LoadEX.dst);
+ return;
+ case Xin_Store:
+ addHRegUse(u, HRmRead, i->Xin.Store.src);
+ addRegUsage_X86AMode(u, i->Xin.Store.dst);
+ return;
+ case Xin_Set32:
+ addHRegUse(u, HRmWrite, i->Xin.Set32.dst);
+ return;
+ case Xin_Bsfr32:
+ addHRegUse(u, HRmRead, i->Xin.Bsfr32.src);
+ addHRegUse(u, HRmWrite, i->Xin.Bsfr32.dst);
+ return;
+ case Xin_MFence:
+ return;
+ case Xin_ACAS:
+ addRegUsage_X86AMode(u, i->Xin.ACAS.addr);
+ addHRegUse(u, HRmRead, hregX86_EBX());
+ addHRegUse(u, HRmModify, hregX86_EAX());
+ return;
+ case Xin_DACAS:
+ addRegUsage_X86AMode(u, i->Xin.DACAS.addr);
+ addHRegUse(u, HRmRead, hregX86_ECX());
+ addHRegUse(u, HRmRead, hregX86_EBX());
+ addHRegUse(u, HRmModify, hregX86_EDX());
+ addHRegUse(u, HRmModify, hregX86_EAX());
+ return;
+ case Xin_FpUnary:
+ addHRegUse(u, HRmRead, i->Xin.FpUnary.src);
+ addHRegUse(u, HRmWrite, i->Xin.FpUnary.dst);
+ return;
+ case Xin_FpBinary:
+ addHRegUse(u, HRmRead, i->Xin.FpBinary.srcL);
+ addHRegUse(u, HRmRead, i->Xin.FpBinary.srcR);
+ addHRegUse(u, HRmWrite, i->Xin.FpBinary.dst);
+ return;
+ case Xin_FpLdSt:
+ addRegUsage_X86AMode(u, i->Xin.FpLdSt.addr);
+ addHRegUse(u, i->Xin.FpLdSt.isLoad ? HRmWrite : HRmRead,
+ i->Xin.FpLdSt.reg);
+ return;
+ case Xin_FpLdStI:
+ addRegUsage_X86AMode(u, i->Xin.FpLdStI.addr);
+ addHRegUse(u, i->Xin.FpLdStI.isLoad ? HRmWrite : HRmRead,
+ i->Xin.FpLdStI.reg);
+ return;
+ case Xin_Fp64to32:
+ addHRegUse(u, HRmRead, i->Xin.Fp64to32.src);
+ addHRegUse(u, HRmWrite, i->Xin.Fp64to32.dst);
+ return;
+ case Xin_FpCMov:
+ addHRegUse(u, HRmRead, i->Xin.FpCMov.src);
+ addHRegUse(u, HRmModify, i->Xin.FpCMov.dst);
+ return;
+ case Xin_FpLdCW:
+ addRegUsage_X86AMode(u, i->Xin.FpLdCW.addr);
+ return;
+ case Xin_FpStSW_AX:
+ addHRegUse(u, HRmWrite, hregX86_EAX());
+ return;
+ case Xin_FpCmp:
+ addHRegUse(u, HRmRead, i->Xin.FpCmp.srcL);
+ addHRegUse(u, HRmRead, i->Xin.FpCmp.srcR);
+ addHRegUse(u, HRmWrite, i->Xin.FpCmp.dst);
+ addHRegUse(u, HRmWrite, hregX86_EAX());
+ return;
+ case Xin_SseLdSt:
+ addRegUsage_X86AMode(u, i->Xin.SseLdSt.addr);
+ addHRegUse(u, i->Xin.SseLdSt.isLoad ? HRmWrite : HRmRead,
+ i->Xin.SseLdSt.reg);
+ return;
+ case Xin_SseLdzLO:
+ addRegUsage_X86AMode(u, i->Xin.SseLdzLO.addr);
+ addHRegUse(u, HRmWrite, i->Xin.SseLdzLO.reg);
+ return;
+ case Xin_SseConst:
+ addHRegUse(u, HRmWrite, i->Xin.SseConst.dst);
+ return;
+ case Xin_Sse32Fx4:
+ vassert(i->Xin.Sse32Fx4.op != Xsse_MOV);
+ unary = toBool( i->Xin.Sse32Fx4.op == Xsse_RCPF
+ || i->Xin.Sse32Fx4.op == Xsse_RSQRTF
+ || i->Xin.Sse32Fx4.op == Xsse_SQRTF );
+ addHRegUse(u, HRmRead, i->Xin.Sse32Fx4.src);
+ addHRegUse(u, unary ? HRmWrite : HRmModify,
+ i->Xin.Sse32Fx4.dst);
+ return;
+ case Xin_Sse32FLo:
+ vassert(i->Xin.Sse32FLo.op != Xsse_MOV);
+ unary = toBool( i->Xin.Sse32FLo.op == Xsse_RCPF
+ || i->Xin.Sse32FLo.op == Xsse_RSQRTF
+ || i->Xin.Sse32FLo.op == Xsse_SQRTF );
+ addHRegUse(u, HRmRead, i->Xin.Sse32FLo.src);
+ addHRegUse(u, unary ? HRmWrite : HRmModify,
+ i->Xin.Sse32FLo.dst);
+ return;
+ case Xin_Sse64Fx2:
+ vassert(i->Xin.Sse64Fx2.op != Xsse_MOV);
+ unary = toBool( i->Xin.Sse64Fx2.op == Xsse_RCPF
+ || i->Xin.Sse64Fx2.op == Xsse_RSQRTF
+ || i->Xin.Sse64Fx2.op == Xsse_SQRTF );
+ addHRegUse(u, HRmRead, i->Xin.Sse64Fx2.src);
+ addHRegUse(u, unary ? HRmWrite : HRmModify,
+ i->Xin.Sse64Fx2.dst);
+ return;
+ case Xin_Sse64FLo:
+ vassert(i->Xin.Sse64FLo.op != Xsse_MOV);
+ unary = toBool( i->Xin.Sse64FLo.op == Xsse_RCPF
+ || i->Xin.Sse64FLo.op == Xsse_RSQRTF
+ || i->Xin.Sse64FLo.op == Xsse_SQRTF );
+ addHRegUse(u, HRmRead, i->Xin.Sse64FLo.src);
+ addHRegUse(u, unary ? HRmWrite : HRmModify,
+ i->Xin.Sse64FLo.dst);
+ return;
+ case Xin_SseReRg:
+ if (i->Xin.SseReRg.op == Xsse_XOR
+ && i->Xin.SseReRg.src == i->Xin.SseReRg.dst) {
+ /* reg-alloc needs to understand 'xor r,r' as a write of r */
+ /* (as opposed to a rite of passage :-) */
+ addHRegUse(u, HRmWrite, i->Xin.SseReRg.dst);
+ } else {
+ addHRegUse(u, HRmRead, i->Xin.SseReRg.src);
+ addHRegUse(u, i->Xin.SseReRg.op == Xsse_MOV
+ ? HRmWrite : HRmModify,
+ i->Xin.SseReRg.dst);
+ }
+ return;
+ case Xin_SseCMov:
+ addHRegUse(u, HRmRead, i->Xin.SseCMov.src);
+ addHRegUse(u, HRmModify, i->Xin.SseCMov.dst);
+ return;
+ case Xin_SseShuf:
+ addHRegUse(u, HRmRead, i->Xin.SseShuf.src);
+ addHRegUse(u, HRmWrite, i->Xin.SseShuf.dst);
+ return;
+ default:
+ ppX86Instr(i, False);
+ vpanic("getRegUsage_X86Instr");
+ }
+}
+
+/* local helper */
+static void mapReg( HRegRemap* m, HReg* r )
+{
+ *r = lookupHRegRemap(m, *r);
+}
+
+void mapRegs_X86Instr ( HRegRemap* m, X86Instr* i, Bool mode64 )
+{
+ vassert(mode64 == False);
+ switch (i->tag) {
+ case Xin_Alu32R:
+ mapRegs_X86RMI(m, i->Xin.Alu32R.src);
+ mapReg(m, &i->Xin.Alu32R.dst);
+ return;
+ case Xin_Alu32M:
+ mapRegs_X86RI(m, i->Xin.Alu32M.src);
+ mapRegs_X86AMode(m, i->Xin.Alu32M.dst);
+ return;
+ case Xin_Sh32:
+ mapReg(m, &i->Xin.Sh32.dst);
+ return;
+ case Xin_Test32:
+ mapRegs_X86RM(m, i->Xin.Test32.dst);
+ return;
+ case Xin_Unary32:
+ mapReg(m, &i->Xin.Unary32.dst);
+ return;
+ case Xin_Lea32:
+ mapRegs_X86AMode(m, i->Xin.Lea32.am);
+ mapReg(m, &i->Xin.Lea32.dst);
+ return;
+ case Xin_MulL:
+ mapRegs_X86RM(m, i->Xin.MulL.src);
+ return;
+ case Xin_Div:
+ mapRegs_X86RM(m, i->Xin.Div.src);
+ return;
+ case Xin_Sh3232:
+ mapReg(m, &i->Xin.Sh3232.src);
+ mapReg(m, &i->Xin.Sh3232.dst);
+ return;
+ case Xin_Push:
+ mapRegs_X86RMI(m, i->Xin.Push.src);
+ return;
+ case Xin_Call:
+ return;
+ case Xin_Goto:
+ mapRegs_X86RI(m, i->Xin.Goto.dst);
+ return;
+ case Xin_CMov32:
+ mapRegs_X86RM(m, i->Xin.CMov32.src);
+ mapReg(m, &i->Xin.CMov32.dst);
+ return;
+ case Xin_LoadEX:
+ mapRegs_X86AMode(m, i->Xin.LoadEX.src);
+ mapReg(m, &i->Xin.LoadEX.dst);
+ return;
+ case Xin_Store:
+ mapReg(m, &i->Xin.Store.src);
+ mapRegs_X86AMode(m, i->Xin.Store.dst);
+ return;
+ case Xin_Set32:
+ mapReg(m, &i->Xin.Set32.dst);
+ return;
+ case Xin_Bsfr32:
+ mapReg(m, &i->Xin.Bsfr32.src);
+ mapReg(m, &i->Xin.Bsfr32.dst);
+ return;
+ case Xin_MFence:
+ return;
+ case Xin_ACAS:
+ mapRegs_X86AMode(m, i->Xin.ACAS.addr);
+ return;
+ case Xin_DACAS:
+ mapRegs_X86AMode(m, i->Xin.DACAS.addr);
+ return;
+ case Xin_FpUnary:
+ mapReg(m, &i->Xin.FpUnary.src);
+ mapReg(m, &i->Xin.FpUnary.dst);
+ return;
+ case Xin_FpBinary:
+ mapReg(m, &i->Xin.FpBinary.srcL);
+ mapReg(m, &i->Xin.FpBinary.srcR);
+ mapReg(m, &i->Xin.FpBinary.dst);
+ return;
+ case Xin_FpLdSt:
+ mapRegs_X86AMode(m, i->Xin.FpLdSt.addr);
+ mapReg(m, &i->Xin.FpLdSt.reg);
+ return;
+ case Xin_FpLdStI:
+ mapRegs_X86AMode(m, i->Xin.FpLdStI.addr);
+ mapReg(m, &i->Xin.FpLdStI.reg);
+ return;
+ case Xin_Fp64to32:
+ mapReg(m, &i->Xin.Fp64to32.src);
+ mapReg(m, &i->Xin.Fp64to32.dst);
+ return;
+ case Xin_FpCMov:
+ mapReg(m, &i->Xin.FpCMov.src);
+ mapReg(m, &i->Xin.FpCMov.dst);
+ return;
+ case Xin_FpLdCW:
+ mapRegs_X86AMode(m, i->Xin.FpLdCW.addr);
+ return;
+ case Xin_FpStSW_AX:
+ return;
+ case Xin_FpCmp:
+ mapReg(m, &i->Xin.FpCmp.srcL);
+ mapReg(m, &i->Xin.FpCmp.srcR);
+ mapReg(m, &i->Xin.FpCmp.dst);
+ return;
+ case Xin_SseConst:
+ mapReg(m, &i->Xin.SseConst.dst);
+ return;
+ case Xin_SseLdSt:
+ mapReg(m, &i->Xin.SseLdSt.reg);
+ mapRegs_X86AMode(m, i->Xin.SseLdSt.addr);
+ break;
+ case Xin_SseLdzLO:
+ mapReg(m, &i->Xin.SseLdzLO.reg);
+ mapRegs_X86AMode(m, i->Xin.SseLdzLO.addr);
+ break;
+ case Xin_Sse32Fx4:
+ mapReg(m, &i->Xin.Sse32Fx4.src);
+ mapReg(m, &i->Xin.Sse32Fx4.dst);
+ return;
+ case Xin_Sse32FLo:
+ mapReg(m, &i->Xin.Sse32FLo.src);
+ mapReg(m, &i->Xin.Sse32FLo.dst);
+ return;
+ case Xin_Sse64Fx2:
+ mapReg(m, &i->Xin.Sse64Fx2.src);
+ mapReg(m, &i->Xin.Sse64Fx2.dst);
+ return;
+ case Xin_Sse64FLo:
+ mapReg(m, &i->Xin.Sse64FLo.src);
+ mapReg(m, &i->Xin.Sse64FLo.dst);
+ return;
+ case Xin_SseReRg:
+ mapReg(m, &i->Xin.SseReRg.src);
+ mapReg(m, &i->Xin.SseReRg.dst);
+ return;
+ case Xin_SseCMov:
+ mapReg(m, &i->Xin.SseCMov.src);
+ mapReg(m, &i->Xin.SseCMov.dst);
+ return;
+ case Xin_SseShuf:
+ mapReg(m, &i->Xin.SseShuf.src);
+ mapReg(m, &i->Xin.SseShuf.dst);
+ return;
+ default:
+ ppX86Instr(i, mode64);
+ vpanic("mapRegs_X86Instr");
+ }
+}
+
+/* Figure out if i represents a reg-reg move, and if so assign the
+ source and destination to *src and *dst. If in doubt say No. Used
+ by the register allocator to do move coalescing.
+*/
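+/* e.g. a move  movl %vr5,%vr9 , built as
+ X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(vr5), vr9), is reported as
+ coalescible with *src = vr5, *dst = vr9. */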
+Bool isMove_X86Instr ( X86Instr* i, HReg* src, HReg* dst )
+{
+ /* Moves between integer regs */
+ if (i->tag == Xin_Alu32R) {
+ if (i->Xin.Alu32R.op != Xalu_MOV)
+ return False;
+ if (i->Xin.Alu32R.src->tag != Xrmi_Reg)
+ return False;
+ *src = i->Xin.Alu32R.src->Xrmi.Reg.reg;
+ *dst = i->Xin.Alu32R.dst;
+ return True;
+ }
+ /* Moves between FP regs */
+ if (i->tag == Xin_FpUnary) {
+ if (i->Xin.FpUnary.op != Xfp_MOV)
+ return False;
+ *src = i->Xin.FpUnary.src;
+ *dst = i->Xin.FpUnary.dst;
+ return True;
+ }
+ if (i->tag == Xin_SseReRg) {
+ if (i->Xin.SseReRg.op != Xsse_MOV)
+ return False;
+ *src = i->Xin.SseReRg.src;
+ *dst = i->Xin.SseReRg.dst;
+ return True;
+ }
+ return False;
+}
+
+
+/* Generate x86 spill/reload instructions under the direction of the
+ register allocator. Note it's critical these don't write the
+ condition codes. */
+
+void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 )
+{
+ X86AMode* am;
+ vassert(offsetB >= 0);
+ vassert(!hregIsVirtual(rreg));
+ vassert(mode64 == False);
+ *i1 = *i2 = NULL;
+ am = X86AMode_IR(offsetB, hregX86_EBP());
+ switch (hregClass(rreg)) {
+ case HRcInt32:
+ *i1 = X86Instr_Alu32M ( Xalu_MOV, X86RI_Reg(rreg), am );
+ return;
+ case HRcFlt64:
+ *i1 = X86Instr_FpLdSt ( False/*store*/, 10, rreg, am );
+ return;
+ case HRcVec128:
+ *i1 = X86Instr_SseLdSt ( False/*store*/, rreg, am );
+ return;
+ default:
+ ppHRegClass(hregClass(rreg));
+ vpanic("genSpill_X86: unimplemented regclass");
+ }
+}
+
+void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offsetB, Bool mode64 )
+{
+ X86AMode* am;
+ vassert(offsetB >= 0);
+ vassert(!hregIsVirtual(rreg));
+ vassert(mode64 == False);
+ *i1 = *i2 = NULL;
+ am = X86AMode_IR(offsetB, hregX86_EBP());
+ switch (hregClass(rreg)) {
+ case HRcInt32:
+ *i1 = X86Instr_Alu32R ( Xalu_MOV, X86RMI_Mem(am), rreg );
+ return;
+ case HRcFlt64:
+ *i1 = X86Instr_FpLdSt ( True/*load*/, 10, rreg, am );
+ return;
+ case HRcVec128:
+ *i1 = X86Instr_SseLdSt ( True/*load*/, rreg, am );
+ return;
+ default:
+ ppHRegClass(hregClass(rreg));
+ vpanic("genReload_X86: unimplemented regclass");
+ }
+}
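+
+/* e.g. (illustrative): for an HRcInt32 rreg and offsetB 24,
+ genSpill_X86 produces  movl %rreg,24(%ebp)  and genReload_X86 the
+ matching  movl 24(%ebp),%rreg ; neither affects %eflags. */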
+
+/* The given instruction reads the specified vreg exactly once, and
+ that vreg is currently located at the given spill offset. If
+ possible, return a variant of the instruction which instead
+ references the spill slot directly. */
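+/* e.g. (illustrative):  orl %vreg,%r3  with vreg spilled at offset 64
+ can be returned as  orl 64(%ebp),%r3 , avoiding a separate reload. */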
+
+X86Instr* directReload_X86( X86Instr* i, HReg vreg, Short spill_off )
+{
+ vassert(spill_off >= 0 && spill_off < 10000); /* let's say */
+
+ /* Deal with form: src=RMI_Reg, dst=Reg where src == vreg
+ Convert to: src=RMI_Mem, dst=Reg
+ */
+ if (i->tag == Xin_Alu32R
+ && (i->Xin.Alu32R.op == Xalu_MOV || i->Xin.Alu32R.op == Xalu_OR
+ || i->Xin.Alu32R.op == Xalu_XOR)
+ && i->Xin.Alu32R.src->tag == Xrmi_Reg
+ && i->Xin.Alu32R.src->Xrmi.Reg.reg == vreg) {
+ vassert(i->Xin.Alu32R.dst != vreg);
+ return X86Instr_Alu32R(
+ i->Xin.Alu32R.op,
+ X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP())),
+ i->Xin.Alu32R.dst
+ );
+ }
+
+ /* Deal with form: src=RMI_Imm, dst=Reg where dst == vreg
+ Convert to: src=RI_Imm, dst=Mem
+ */
+ if (i->tag == Xin_Alu32R
+ && (i->Xin.Alu32R.op == Xalu_CMP)
+ && i->Xin.Alu32R.src->tag == Xrmi_Imm
+ && i->Xin.Alu32R.dst == vreg) {
+ return X86Instr_Alu32M(
+ i->Xin.Alu32R.op,
+ X86RI_Imm( i->Xin.Alu32R.src->Xrmi.Imm.imm32 ),
+ X86AMode_IR( spill_off, hregX86_EBP())
+ );
+ }
+
+ /* Deal with form: Push(RMI_Reg)
+ Convert to: Push(RMI_Mem)
+ */
+ if (i->tag == Xin_Push
+ && i->Xin.Push.src->tag == Xrmi_Reg
+ && i->Xin.Push.src->Xrmi.Reg.reg == vreg) {
+ return X86Instr_Push(
+ X86RMI_Mem( X86AMode_IR( spill_off, hregX86_EBP()))
+ );
+ }
+
+ /* Deal with form: CMov32(src=RM_Reg, dst) where vreg == src
+ Convert to CMov32(RM_Mem, dst) */
+ if (i->tag == Xin_CMov32
+ && i->Xin.CMov32.src->tag == Xrm_Reg
+ && i->Xin.CMov32.src->Xrm.Reg.reg == vreg) {
+ vassert(i->Xin.CMov32.dst != vreg);
+ return X86Instr_CMov32(
+ i->Xin.CMov32.cond,
+ X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() )),
+ i->Xin.CMov32.dst
+ );
+ }
+
+ /* Deal with form: Test32(imm,RM_Reg vreg) -> Test32(imm,amode) */
+ if (i->tag == Xin_Test32
+ && i->Xin.Test32.dst->tag == Xrm_Reg
+ && i->Xin.Test32.dst->Xrm.Reg.reg == vreg) {
+ return X86Instr_Test32(
+ i->Xin.Test32.imm32,
+ X86RM_Mem( X86AMode_IR( spill_off, hregX86_EBP() ) )
+ );
+ }
+
+ return NULL;
+}
+
+
+/* --------- The x86 assembler (bleh.) --------- */
+
+static UChar iregNo ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcInt32);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 7);
+ return toUChar(n);
+}
+
+static UInt fregNo ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcFlt64);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 5);
+ return n;
+}
+
+static UInt vregNo ( HReg r )
+{
+ UInt n;
+ vassert(hregClass(r) == HRcVec128);
+ vassert(!hregIsVirtual(r));
+ n = hregNumber(r);
+ vassert(n <= 7);
+ return n;
+}
+
+static UChar mkModRegRM ( UChar mod, UChar reg, UChar regmem )
+{
+ return toUChar( ((mod & 3) << 6)
+ | ((reg & 7) << 3)
+ | (regmem & 7) );
+}
+
+static UChar mkSIB ( Int shift, Int regindex, Int regbase )
+{
+ return toUChar( ((shift & 3) << 6)
+ | ((regindex & 7) << 3)
+ | (regbase & 7) );
+}
+
+static UChar* emit32 ( UChar* p, UInt w32 )
+{
+ *p++ = toUChar( w32 & 0x000000FF);
+ *p++ = toUChar((w32 >> 8) & 0x000000FF);
+ *p++ = toUChar((w32 >> 16) & 0x000000FF);
+ *p++ = toUChar((w32 >> 24) & 0x000000FF);
+ return p;
+}
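+
+/* e.g. emit32(p, 0x12345678) stores the bytes 78 56 34 12 -- i.e.
+ little-endian -- and returns p+4. */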
+
+/* Does a sign-extend of the lowest 8 bits give
+ the original number? */
+static Bool fits8bits ( UInt w32 )
+{
+ Int i32 = (Int)w32;
+ return toBool(i32 == ((i32 << 24) >> 24));
+}
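+
+/* e.g. fits8bits(0xFFFFFF80) (-128) is True, whereas fits8bits(0x80)
+ (+128) is False, since sign-extending 0x80 gives 0xFFFFFF80. */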
+
+
+/* Forming mod-reg-rm bytes and scale-index-base bytes.
+
+ greg, 0(ereg) | ereg != ESP && ereg != EBP
+ = 00 greg ereg
+
+ greg, d8(ereg) | ereg != ESP
+ = 01 greg ereg, d8
+
+ greg, d32(ereg) | ereg != ESP
+ = 10 greg ereg, d32
+
+ greg, d8(%esp) = 01 greg 100, 0x24, d8
+
+ -----------------------------------------------
+
+ greg, d8(base,index,scale)
+ | index != ESP
+ = 01 greg 100, scale index base, d8
+
+ greg, d32(base,index,scale)
+ | index != ESP
+ = 10 greg 100, scale index base, d32
+*/
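+/* e.g. (illustrative): for greg=%eax (0) and am = 4(%ebx), doAMode_M
+ emits  01 000 011, 0x04  -- i.e. the bytes 0x43 0x04. */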
+static UChar* doAMode_M ( UChar* p, HReg greg, X86AMode* am )
+{
+ if (am->tag == Xam_IR) {
+ if (am->Xam.IR.imm == 0
+ && am->Xam.IR.reg != hregX86_ESP()
+ && am->Xam.IR.reg != hregX86_EBP() ) {
+ *p++ = mkModRegRM(0, iregNo(greg), iregNo(am->Xam.IR.reg));
+ return p;
+ }
+ if (fits8bits(am->Xam.IR.imm)
+ && am->Xam.IR.reg != hregX86_ESP()) {
+ *p++ = mkModRegRM(1, iregNo(greg), iregNo(am->Xam.IR.reg));
+ *p++ = toUChar(am->Xam.IR.imm & 0xFF);
+ return p;
+ }
+ if (am->Xam.IR.reg != hregX86_ESP()) {
+ *p++ = mkModRegRM(2, iregNo(greg), iregNo(am->Xam.IR.reg));
+ p = emit32(p, am->Xam.IR.imm);
+ return p;
+ }
+ if (am->Xam.IR.reg == hregX86_ESP()
+ && fits8bits(am->Xam.IR.imm)) {
+ *p++ = mkModRegRM(1, iregNo(greg), 4);
+ *p++ = 0x24;
+ *p++ = toUChar(am->Xam.IR.imm & 0xFF);
+ return p;
+ }
+ ppX86AMode(am);
+ vpanic("doAMode_M: can't emit amode IR");
+ /*NOTREACHED*/
+ }
+ if (am->tag == Xam_IRRS) {
+ if (fits8bits(am->Xam.IRRS.imm)
+ && am->Xam.IRRS.index != hregX86_ESP()) {
+ *p++ = mkModRegRM(1, iregNo(greg), 4);
+ *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
+ am->Xam.IRRS.base);
+ *p++ = toUChar(am->Xam.IRRS.imm & 0xFF);
+ return p;
+ }
+ if (am->Xam.IRRS.index != hregX86_ESP()) {
+ *p++ = mkModRegRM(2, iregNo(greg), 4);
+ *p++ = mkSIB(am->Xam.IRRS.shift, am->Xam.IRRS.index,
+ am->Xam.IRRS.base);
+ p = emit32(p, am->Xam.IRRS.imm);
+ return p;
+ }
+ ppX86AMode(am);
+ vpanic("doAMode_M: can't emit amode IRRS");
+ /*NOTREACHED*/
+ }
+ vpanic("doAMode_M: unknown amode");
+ /*NOTREACHED*/
+}
+
+
+/* Emit a mod-reg-rm byte when the rm bit denotes a reg. */
+static UChar* doAMode_R ( UChar* p, HReg greg, HReg ereg )
+{
+ *p++ = mkModRegRM(3, iregNo(greg), iregNo(ereg));
+ return p;
+}
+
+
+/* Emit ffree %st(7) */
+static UChar* do_ffree_st7 ( UChar* p )
+{
+ *p++ = 0xDD;
+ *p++ = 0xC7;
+ return p;
+}
+
+/* Emit fstp %st(i), 1 <= i <= 7 */
+static UChar* do_fstp_st ( UChar* p, Int i )
+{
+ vassert(1 <= i && i <= 7);
+ *p++ = 0xDD;
+ *p++ = toUChar(0xD8+i);
+ return p;
+}
+
+/* Emit fld %st(i), 0 <= i <= 6 */
+static UChar* do_fld_st ( UChar* p, Int i )
+{
+ vassert(0 <= i && i <= 6);
+ *p++ = 0xD9;
+ *p++ = toUChar(0xC0+i);
+ return p;
+}
+
+/* Emit f<op> %st(0) */
+static UChar* do_fop1_st ( UChar* p, X86FpOp op )
+{
+ switch (op) {
+ case Xfp_NEG: *p++ = 0xD9; *p++ = 0xE0; break;
+ case Xfp_ABS: *p++ = 0xD9; *p++ = 0xE1; break;
+ case Xfp_SQRT: *p++ = 0xD9; *p++ = 0xFA; break;
+ case Xfp_ROUND: *p++ = 0xD9; *p++ = 0xFC; break;
+ case Xfp_SIN: *p++ = 0xD9; *p++ = 0xFE; break;
+ case Xfp_COS: *p++ = 0xD9; *p++ = 0xFF; break;
+ case Xfp_2XM1: *p++ = 0xD9; *p++ = 0xF0; break;
+ case Xfp_MOV: break;
+ case Xfp_TAN: p = do_ffree_st7(p); /* since fptan pushes 1.0 */
+ *p++ = 0xD9; *p++ = 0xF2; /* fptan */
+ *p++ = 0xD9; *p++ = 0xF7; /* fincstp */
+ break;
+ default: vpanic("do_fop1_st: unknown op");
+ }
+ return p;
+}
+
+/* Emit f<op> %st(i), 1 <= i <= 5 */
+static UChar* do_fop2_st ( UChar* p, X86FpOp op, Int i )
+{
+# define fake(_n) mkHReg((_n), HRcInt32, False)
+ Int subopc;
+ switch (op) {
+ case Xfp_ADD: subopc = 0; break;
+ case Xfp_SUB: subopc = 4; break;
+ case Xfp_MUL: subopc = 1; break;
+ case Xfp_DIV: subopc = 6; break;
+ default: vpanic("do_fop2_st: unknown op");
+ }
+ *p++ = 0xD8;
+ p = doAMode_R(p, fake(subopc), fake(i));
+ return p;
+# undef fake
+}
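+
+/* e.g. do_fop2_st(p, Xfp_ADD, 3) emits D8 C3, i.e. fadd %st(3),%st. */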
+
+/* Push a 32-bit word on the stack. The word depends on tags[3:0]:
+ each byte is either 0x00 or 0xFF, according to the corresponding
+ bit in tags[]. */
+static UChar* push_word_from_tags ( UChar* p, UShort tags )
+{
+ UInt w;
+ vassert(0 == (tags & ~0xF));
+ if (tags == 0) {
+ /* pushl $0x00000000 */
+ *p++ = 0x6A;
+ *p++ = 0x00;
+ }
+ else
+ /* pushl $0xFFFFFFFF */
+ if (tags == 0xF) {
+ *p++ = 0x6A;
+ *p++ = 0xFF;
+ } else {
+ vassert(0); /* awaiting test case */
+ w = 0;
+ if (tags & 1) w |= 0x000000FF;
+ if (tags & 2) w |= 0x0000FF00;
+ if (tags & 4) w |= 0x00FF0000;
+ if (tags & 8) w |= 0xFF000000;
+ *p++ = 0x68;
+ p = emit32(p, w);
+ }
+ return p;
+}
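+
+/* e.g. tags == 0x5 would give w == 0x00FF00FF, hence the bytes
+ 68 FF 00 FF 00 (pushl $0x00FF00FF) -- though, per the vassert above,
+ that general path is still awaiting a test case. */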
+
+/* Emit an instruction into buf and return the number of bytes used.
+ Note that buf is not the insn's final place, and therefore it is
+ imperative to emit position-independent code. */
+
+Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr* i,
+ Bool mode64, void* dispatch )
+{
+ UInt irno, opc, opc_rr, subopc_imm, opc_imma, opc_cl, opc_imm, subopc;
+
+ UInt xtra;
+ UChar* p = &buf[0];
+ UChar* ptmp;
+ vassert(nbuf >= 32);
+ vassert(mode64 == False);
+
+ /* Wrap an integer as an int register, for use assembling
+ GrpN insns, in which the greg field is used as a sub-opcode
+ and does not really contain a register. */
+# define fake(_n) mkHReg((_n), HRcInt32, False)
+
+ /* vex_printf("asm ");ppX86Instr(i, mode64); vex_printf("\n"); */
+
+ switch (i->tag) {
+
+ case Xin_Alu32R:
+ /* Deal specially with MOV */
+ if (i->Xin.Alu32R.op == Xalu_MOV) {
+ switch (i->Xin.Alu32R.src->tag) {
+ case Xrmi_Imm:
+ *p++ = toUChar(0xB8 + iregNo(i->Xin.Alu32R.dst));
+ p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
+ goto done;
+ case Xrmi_Reg:
+ *p++ = 0x89;
+ p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
+ i->Xin.Alu32R.dst);
+ goto done;
+ case Xrmi_Mem:
+ *p++ = 0x8B;
+ p = doAMode_M(p, i->Xin.Alu32R.dst,
+ i->Xin.Alu32R.src->Xrmi.Mem.am);
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+ /* MUL */
+ if (i->Xin.Alu32R.op == Xalu_MUL) {
+ switch (i->Xin.Alu32R.src->tag) {
+ case Xrmi_Reg:
+ *p++ = 0x0F;
+ *p++ = 0xAF;
+ p = doAMode_R(p, i->Xin.Alu32R.dst,
+ i->Xin.Alu32R.src->Xrmi.Reg.reg);
+ goto done;
+ case Xrmi_Mem:
+ *p++ = 0x0F;
+ *p++ = 0xAF;
+ p = doAMode_M(p, i->Xin.Alu32R.dst,
+ i->Xin.Alu32R.src->Xrmi.Mem.am);
+ goto done;
+ case Xrmi_Imm:
+ if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
+ *p++ = 0x6B;
+ p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
+ *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
+ } else {
+ *p++ = 0x69;
+ p = doAMode_R(p, i->Xin.Alu32R.dst, i->Xin.Alu32R.dst);
+ p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
+ }
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+ /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP */
+ opc = opc_rr = subopc_imm = opc_imma = 0;
+ switch (i->Xin.Alu32R.op) {
+ case Xalu_ADC: opc = 0x13; opc_rr = 0x11;
+ subopc_imm = 2; opc_imma = 0x15; break;
+ case Xalu_ADD: opc = 0x03; opc_rr = 0x01;
+ subopc_imm = 0; opc_imma = 0x05; break;
+ case Xalu_SUB: opc = 0x2B; opc_rr = 0x29;
+ subopc_imm = 5; opc_imma = 0x2D; break;
+ case Xalu_SBB: opc = 0x1B; opc_rr = 0x19;
+ subopc_imm = 3; opc_imma = 0x1D; break;
+ case Xalu_AND: opc = 0x23; opc_rr = 0x21;
+ subopc_imm = 4; opc_imma = 0x25; break;
+ case Xalu_XOR: opc = 0x33; opc_rr = 0x31;
+ subopc_imm = 6; opc_imma = 0x35; break;
+ case Xalu_OR: opc = 0x0B; opc_rr = 0x09;
+ subopc_imm = 1; opc_imma = 0x0D; break;
+ case Xalu_CMP: opc = 0x3B; opc_rr = 0x39;
+ subopc_imm = 7; opc_imma = 0x3D; break;
+ default: goto bad;
+ }
+ switch (i->Xin.Alu32R.src->tag) {
+ case Xrmi_Imm:
+ if (i->Xin.Alu32R.dst == hregX86_EAX()
+ && !fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
+ *p++ = toUChar(opc_imma);
+ p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
+ } else
+ if (fits8bits(i->Xin.Alu32R.src->Xrmi.Imm.imm32)) {
+ *p++ = 0x83;
+ p = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
+ *p++ = toUChar(0xFF & i->Xin.Alu32R.src->Xrmi.Imm.imm32);
+ } else {
+ *p++ = 0x81;
+ p = doAMode_R(p, fake(subopc_imm), i->Xin.Alu32R.dst);
+ p = emit32(p, i->Xin.Alu32R.src->Xrmi.Imm.imm32);
+ }
+ goto done;
+ case Xrmi_Reg:
+ *p++ = toUChar(opc_rr);
+ p = doAMode_R(p, i->Xin.Alu32R.src->Xrmi.Reg.reg,
+ i->Xin.Alu32R.dst);
+ goto done;
+ case Xrmi_Mem:
+ *p++ = toUChar(opc);
+ p = doAMode_M(p, i->Xin.Alu32R.dst,
+ i->Xin.Alu32R.src->Xrmi.Mem.am);
+ goto done;
+ default:
+ goto bad;
+ }
+ break;
+
+ case Xin_Alu32M:
+ /* Deal specially with MOV */
+ if (i->Xin.Alu32M.op == Xalu_MOV) {
+ switch (i->Xin.Alu32M.src->tag) {
+ case Xri_Reg:
+ *p++ = 0x89;
+ p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
+ i->Xin.Alu32M.dst);
+ goto done;
+ case Xri_Imm:
+ *p++ = 0xC7;
+ p = doAMode_M(p, fake(0), i->Xin.Alu32M.dst);
+ p = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
+ goto done;
+ default:
+ goto bad;
+ }
+ }
+ /* ADD/SUB/ADC/SBB/AND/OR/XOR/CMP. MUL is not
+ allowed here. */
+ opc = subopc_imm = opc_imma = 0;
+ switch (i->Xin.Alu32M.op) {
+ case Xalu_ADD: opc = 0x01; subopc_imm = 0; break;
+ case Xalu_SUB: opc = 0x29; subopc_imm = 5; break;
+ case Xalu_CMP: opc = 0x39; subopc_imm = 7; break;
+ default: goto bad;
+ }
+ switch (i->Xin.Alu32M.src->tag) {
+ case Xri_Reg:
+ *p++ = toUChar(opc);
+ p = doAMode_M(p, i->Xin.Alu32M.src->Xri.Reg.reg,
+ i->Xin.Alu32M.dst);
+ goto done;
+ case Xri_Imm:
+ if (fits8bits(i->Xin.Alu32M.src->Xri.Imm.imm32)) {
+ *p++ = 0x83;
+ p = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
+ *p++ = toUChar(0xFF & i->Xin.Alu32M.src->Xri.Imm.imm32);
+ goto done;
+ } else {
+ *p++ = 0x81;
+ p = doAMode_M(p, fake(subopc_imm), i->Xin.Alu32M.dst);
+ p = emit32(p, i->Xin.Alu32M.src->Xri.Imm.imm32);
+ goto done;
+ }
+ default:
+ goto bad;
+ }
+ break;
+
+ case Xin_Sh32:
+ opc_cl = opc_imm = subopc = 0;
+ switch (i->Xin.Sh32.op) {
+ case Xsh_SHR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 5; break;
+ case Xsh_SAR: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 7; break;
+ case Xsh_SHL: opc_cl = 0xD3; opc_imm = 0xC1; subopc = 4; break;
+ default: goto bad;
+ }
+ if (i->Xin.Sh32.src == 0) {
+ *p++ = toUChar(opc_cl);
+ p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
+ } else {
+ *p++ = toUChar(opc_imm);
+ p = doAMode_R(p, fake(subopc), i->Xin.Sh32.dst);
+ *p++ = (UChar)(i->Xin.Sh32.src);
+ }
+ goto done;
+
+ case Xin_Test32:
+ if (i->Xin.Test32.dst->tag == Xrm_Reg) {
+ /* testl $imm32, %reg */
+ *p++ = 0xF7;
+ p = doAMode_R(p, fake(0), i->Xin.Test32.dst->Xrm.Reg.reg);
+ p = emit32(p, i->Xin.Test32.imm32);
+ goto done;
+ } else {
+ /* testl $imm32, amode */
+ *p++ = 0xF7;
+ p = doAMode_M(p, fake(0), i->Xin.Test32.dst->Xrm.Mem.am);
+ p = emit32(p, i->Xin.Test32.imm32);
+ goto done;
+ }
+
+ case Xin_Unary32:
+ if (i->Xin.Unary32.op == Xun_NOT) {
+ *p++ = 0xF7;
+ p = doAMode_R(p, fake(2), i->Xin.Unary32.dst);
+ goto done;
+ }
+ if (i->Xin.Unary32.op == Xun_NEG) {
+ *p++ = 0xF7;
+ p = doAMode_R(p, fake(3), i->Xin.Unary32.dst);
+ goto done;
+ }
+ break;
+
+ case Xin_Lea32:
+ *p++ = 0x8D;
+ p = doAMode_M(p, i->Xin.Lea32.dst, i->Xin.Lea32.am);
+ goto done;
+
+ case Xin_MulL:
+ subopc = i->Xin.MulL.syned ? 5 : 4;
+ *p++ = 0xF7;
+ switch (i->Xin.MulL.src->tag) {
+ case Xrm_Mem:
+ p = doAMode_M(p, fake(subopc),
+ i->Xin.MulL.src->Xrm.Mem.am);
+ goto done;
+ case Xrm_Reg:
+ p = doAMode_R(p, fake(subopc),
+ i->Xin.MulL.src->Xrm.Reg.reg);
+ goto done;
+ default:
+ goto bad;
+ }
+ break;
+
+ case Xin_Div:
+ subopc = i->Xin.Div.syned ? 7 : 6;
+ *p++ = 0xF7;
+ switch (i->Xin.Div.src->tag) {
+ case Xrm_Mem:
+ p = doAMode_M(p, fake(subopc),
+ i->Xin.Div.src->Xrm.Mem.am);
+ goto done;
+ case Xrm_Reg:
+ p = doAMode_R(p, fake(subopc),
+ i->Xin.Div.src->Xrm.Reg.reg);
+ goto done;
+ default:
+ goto bad;
+ }
+ break;
+
+ case Xin_Sh3232:
+ vassert(i->Xin.Sh3232.op == Xsh_SHL || i->Xin.Sh3232.op == Xsh_SHR);
+ if (i->Xin.Sh3232.amt == 0) {
+ /* shldl/shrdl by %cl */
+ *p++ = 0x0F;
+ if (i->Xin.Sh3232.op == Xsh_SHL) {
+ *p++ = 0xA5;
+ } else {
+ *p++ = 0xAD;
+ }
+ p = doAMode_R(p, i->Xin.Sh3232.src, i->Xin.Sh3232.dst);
+ goto done;
+ }
+ break;
+
+ case Xin_Push:
+ switch (i->Xin.Push.src->tag) {
+ case Xrmi_Mem:
+ *p++ = 0xFF;
+ p = doAMode_M(p, fake(6), i->Xin.Push.src->Xrmi.Mem.am);
+ goto done;
+ case Xrmi_Imm:
+ *p++ = 0x68;
+ p = emit32(p, i->Xin.Push.src->Xrmi.Imm.imm32);
+ goto done;
+ case Xrmi_Reg:
+ *p++ = toUChar(0x50 + iregNo(i->Xin.Push.src->Xrmi.Reg.reg));
+ goto done;
+ default:
+ goto bad;
+ }
+
+ case Xin_Call:
+ /* See detailed comment for Xin_Call in getRegUsage_X86Instr above
+ for explanation of this. */
+ switch (i->Xin.Call.regparms) {
+ case 0: irno = iregNo(hregX86_EAX()); break;
+ case 1: irno = iregNo(hregX86_EDX()); break;
+ case 2: irno = iregNo(hregX86_ECX()); break;
+ case 3: irno = iregNo(hregX86_EDI()); break;
+ default: vpanic(" emit_X86Instr:call:regparms");
+ }
+ /* jump over the following two insns if the condition does not
+ hold */
+ if (i->Xin.Call.cond != Xcc_ALWAYS) {
+ *p++ = toUChar(0x70 + (0xF & (i->Xin.Call.cond ^ 1)));
+ *p++ = 0x07; /* 7 bytes in the next two insns */
+ }
+ /* movl $target, %tmp */
+ *p++ = toUChar(0xB8 + irno);
+ p = emit32(p, i->Xin.Call.target);
+ /* call *%tmp */
+ *p++ = 0xFF;
+ *p++ = toUChar(0xD0 + irno);
+ goto done;
+
+ case Xin_Goto:
+ /* Use ptmp for backpatching conditional jumps. */
+ ptmp = NULL;
+
+ /* First off, if this is conditional, create a conditional
+ jump over the rest of it. */
+ if (i->Xin.Goto.cond != Xcc_ALWAYS) {
+ /* jmp fwds if !condition */
+ *p++ = toUChar(0x70 + (0xF & (i->Xin.Goto.cond ^ 1)));
+ ptmp = p; /* fill in this bit later */
+ *p++ = 0; /* # of bytes to jump over; don't know how many yet. */
+ }
+
+ /* If this is a non-boring jump, set %ebp (the guest state
+ pointer) appropriately. */
+ /* movl $magic_number, %ebp */
+ switch (i->Xin.Goto.jk) {
+ case Ijk_ClientReq:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_CLIENTREQ); break;
+ case Ijk_Sys_int128:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SYS_INT128); break;
+ case Ijk_Sys_int129:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SYS_INT129); break;
+ case Ijk_Sys_int130:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SYS_INT130); break;
+ case Ijk_Yield:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_YIELD); break;
+ case Ijk_EmWarn:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_EMWARN); break;
+ case Ijk_MapFail:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_MAPFAIL); break;
+ case Ijk_NoDecode:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_NODECODE); break;
+ case Ijk_TInval:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_TINVAL); break;
+ case Ijk_NoRedir:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_NOREDIR); break;
+ case Ijk_Sys_sysenter:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SYS_SYSENTER); break;
+ case Ijk_SigTRAP:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SIGTRAP); break;
+ case Ijk_SigSEGV:
+ *p++ = 0xBD;
+ p = emit32(p, VEX_TRC_JMP_SIGSEGV); break;
+ case Ijk_Ret:
+ case Ijk_Call:
+ case Ijk_Boring:
+ break;
+ default:
+ ppIRJumpKind(i->Xin.Goto.jk);
+ vpanic("emit_X86Instr.Xin_Goto: unknown jump kind");
+ }
+
+ /* Get the destination address into %eax */
+ if (i->Xin.Goto.dst->tag == Xri_Imm) {
+ /* movl $immediate, %eax */
+ *p++ = 0xB8;
+ p = emit32(p, i->Xin.Goto.dst->Xri.Imm.imm32);
+ } else {
+ vassert(i->Xin.Goto.dst->tag == Xri_Reg);
+ /* movl %reg, %eax */
+ if (i->Xin.Goto.dst->Xri.Reg.reg != hregX86_EAX()) {
+ *p++ = 0x89;
+ p = doAMode_R(p, i->Xin.Goto.dst->Xri.Reg.reg, hregX86_EAX());
+ }
+ }
+
+ /* Get the dispatcher address into %edx. This has to happen
+ after the load of %eax since %edx might be carrying the value
+ destined for %eax immediately prior to this Xin_Goto. */
+ vassert(sizeof(UInt) == sizeof(void*));
+ vassert(dispatch != NULL);
+ /* movl $imm32, %edx */
+ *p++ = 0xBA;
+ p = emit32(p, (UInt)Ptr_to_ULong(dispatch));
+
+ /* jmp *%edx */
+ *p++ = 0xFF;
+ *p++ = 0xE2;
+
+ /* Fix up the conditional jump, if there was one. */
+ if (i->Xin.Goto.cond != Xcc_ALWAYS) {
+ Int delta = p - ptmp;
+ vassert(delta > 0 && delta < 20);
+ *ptmp = toUChar(delta-1);
+ }
+ goto done;
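+      /* Example (illustrative): an unconditional Ijk_ClientReq goto
+         to an immediate destination assembles to
+            movl $VEX_TRC_JMP_CLIENTREQ, %ebp
+            movl $dst, %eax
+            movl $dispatch, %edx
+            jmp  *%edx
+         with the 2-byte jcc-over prefix added in the conditional
+         case. */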
+
+ case Xin_CMov32:
+ vassert(i->Xin.CMov32.cond != Xcc_ALWAYS);
+
+ /* This generates cmov, which is illegal on P54/P55. */
+ /*
+ *p++ = 0x0F;
+ *p++ = toUChar(0x40 + (0xF & i->Xin.CMov32.cond));
+ if (i->Xin.CMov32.src->tag == Xrm_Reg) {
+ p = doAMode_R(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Reg.reg);
+ goto done;
+ }
+ if (i->Xin.CMov32.src->tag == Xrm_Mem) {
+ p = doAMode_M(p, i->Xin.CMov32.dst, i->Xin.CMov32.src->Xrm.Mem.am);
+ goto done;
+ }
+ */
+
+ /* Alternative version which works on any x86 variant. */
+ /* jmp fwds if !condition */
+ *p++ = toUChar(0x70 + (i->Xin.CMov32.cond ^ 1));
+ *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
+ ptmp = p;
+
+ switch (i->Xin.CMov32.src->tag) {
+ case Xrm_Reg:
+ /* Big sigh. This is movl E -> G ... */
+ *p++ = 0x89;
+ p = doAMode_R(p, i->Xin.CMov32.src->Xrm.Reg.reg,
+ i->Xin.CMov32.dst);
+
+ break;
+ case Xrm_Mem:
+ /* ... whereas this is movl G -> E. That's why the args
+ to doAMode_R appear to be the wrong way round in the
+ Xrm_Reg case. */
+ *p++ = 0x8B;
+ p = doAMode_M(p, i->Xin.CMov32.dst,
+ i->Xin.CMov32.src->Xrm.Mem.am);
+ break;
+ default:
+ goto bad;
+ }
+ /* Fill in the jump offset. */
+ *(ptmp-1) = toUChar(p - ptmp);
+ goto done;
+
+ case Xin_LoadEX:
+ if (i->Xin.LoadEX.szSmall == 1 && !i->Xin.LoadEX.syned) {
+ /* movzbl */
+ *p++ = 0x0F;
+ *p++ = 0xB6;
+ p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
+ goto done;
+ }
+ if (i->Xin.LoadEX.szSmall == 2 && !i->Xin.LoadEX.syned) {
+ /* movzwl */
+ *p++ = 0x0F;
+ *p++ = 0xB7;
+ p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
+ goto done;
+ }
+ if (i->Xin.LoadEX.szSmall == 1 && i->Xin.LoadEX.syned) {
+ /* movsbl */
+ *p++ = 0x0F;
+ *p++ = 0xBE;
+ p = doAMode_M(p, i->Xin.LoadEX.dst, i->Xin.LoadEX.src);
+ goto done;
+ }
+ break;
+
+ case Xin_Set32:
+ /* Make the destination register be 1 or 0, depending on whether
+ the relevant condition holds. We have to dodge and weave
+ when the destination is %esi or %edi as we cannot directly
+ emit the native 'setb %reg' for those. Further complication:
+ the top 24 bits of the destination should be forced to zero,
+ but doing 'xor %r,%r' kills the flag(s) we are about to read.
+         Sigh.  So start off by moving $0 into the dest. */
+
+ /* Do we need to swap in %eax? */
+ if (iregNo(i->Xin.Set32.dst) >= 4) {
+ /* xchg %eax, %dst */
+ *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
+ /* movl $0, %eax */
+         *p++ = toUChar(0xB8 + iregNo(hregX86_EAX()));
+ p = emit32(p, 0);
+ /* setb lo8(%eax) */
+ *p++ = 0x0F;
+ *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
+ p = doAMode_R(p, fake(0), hregX86_EAX());
+ /* xchg %eax, %dst */
+ *p++ = toUChar(0x90 + iregNo(i->Xin.Set32.dst));
+ } else {
+ /* movl $0, %dst */
+ *p++ = toUChar(0xB8 + iregNo(i->Xin.Set32.dst));
+ p = emit32(p, 0);
+ /* setb lo8(%dst) */
+ *p++ = 0x0F;
+ *p++ = toUChar(0x90 + (0xF & i->Xin.Set32.cond));
+ p = doAMode_R(p, fake(0), i->Xin.Set32.dst);
+ }
+ goto done;
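+      /* Example (illustrative): for dst == %esi and cond == Xcc_Z,
+         the swap path gives
+            96                xchgl %eax, %esi
+            B8 00 00 00 00    movl  $0, %eax
+            0F 94 C0          setz  %al
+            96                xchgl %eax, %esi
+         leaving 0 or 1 in %esi with %eax restored. */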
+
+ case Xin_Bsfr32:
+ *p++ = 0x0F;
+ if (i->Xin.Bsfr32.isFwds) {
+ *p++ = 0xBC;
+ } else {
+ *p++ = 0xBD;
+ }
+ p = doAMode_R(p, i->Xin.Bsfr32.dst, i->Xin.Bsfr32.src);
+ goto done;
+
+ case Xin_MFence:
+      /* see comment in host_x86_defs.h re this insn */
+ if (0) vex_printf("EMIT FENCE\n");
+ if (i->Xin.MFence.hwcaps & (VEX_HWCAPS_X86_SSE3
+ |VEX_HWCAPS_X86_SSE2)) {
+ /* mfence */
+ *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF0;
+ goto done;
+ }
+ if (i->Xin.MFence.hwcaps & VEX_HWCAPS_X86_SSE1) {
+ /* sfence */
+ *p++ = 0x0F; *p++ = 0xAE; *p++ = 0xF8;
+ /* lock addl $0,0(%esp) */
+ *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
+ *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
+ goto done;
+ }
+ if (i->Xin.MFence.hwcaps == 0/*baseline, no SSE*/) {
+ /* lock addl $0,0(%esp) */
+ *p++ = 0xF0; *p++ = 0x83; *p++ = 0x44;
+ *p++ = 0x24; *p++ = 0x00; *p++ = 0x00;
+ goto done;
+ }
+ vpanic("emit_X86Instr:mfence:hwcaps");
+ /*NOTREACHED*/
+ break;
+
+ case Xin_ACAS:
+ /* lock */
+ *p++ = 0xF0;
+ /* cmpxchg{b,w,l} %ebx,mem. Expected-value in %eax, new value
+ in %ebx. The new-value register is hardwired to be %ebx
+ since letting it be any integer register gives the problem
+         that the low bytes of %esi and %edi are unaddressable on
+         x86 and hence we
+ would have to resort to the same kind of trickery as with
+ byte-sized Xin.Store, just below. Given that this isn't
+ performance critical, it is simpler just to force the
+ register operand to %ebx (could equally be %ecx or %edx).
+ (Although %ebx is more consistent with cmpxchg8b.) */
+ if (i->Xin.ACAS.sz == 2) *p++ = 0x66;
+ *p++ = 0x0F;
+ if (i->Xin.ACAS.sz == 1) *p++ = 0xB0; else *p++ = 0xB1;
+ p = doAMode_M(p, hregX86_EBX(), i->Xin.ACAS.addr);
+ goto done;
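+      /* Example (illustrative, assuming a zero-displacement (%esi)
+         amode): a 4-byte ACAS would encode as
+            F0 0F B1 1E       lock cmpxchgl %ebx, (%esi)
+         with the expected value in %eax and the old value returned
+         there. */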
+
+ case Xin_DACAS:
+ /* lock */
+ *p++ = 0xF0;
+ /* cmpxchg8b m64. Expected-value in %edx:%eax, new value
+ in %ecx:%ebx. All 4 regs are hardwired in the ISA, so
+ aren't encoded in the insn. */
+ *p++ = 0x0F;
+ *p++ = 0xC7;
+ p = doAMode_M(p, fake(1), i->Xin.DACAS.addr);
+ goto done;
+
+ case Xin_Store:
+ if (i->Xin.Store.sz == 2) {
+ /* This case, at least, is simple, given that we can
+ reference the low 16 bits of any integer register. */
+ *p++ = 0x66;
+ *p++ = 0x89;
+ p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
+ goto done;
+ }
+
+ if (i->Xin.Store.sz == 1) {
+ /* We have to do complex dodging and weaving if src is not
+ the low 8 bits of %eax/%ebx/%ecx/%edx. */
+ if (iregNo(i->Xin.Store.src) < 4) {
+ /* we're OK, can do it directly */
+ *p++ = 0x88;
+ p = doAMode_M(p, i->Xin.Store.src, i->Xin.Store.dst);
+ goto done;
+ } else {
+ /* Bleh. This means the source is %edi or %esi. Since
+ the address mode can only mention three registers, at
+ least one of %eax/%ebx/%ecx/%edx must be available to
+ temporarily swap the source into, so the store can
+ happen. So we have to look at the regs mentioned
+ in the amode. */
+ HReg swap = INVALID_HREG;
+ HReg eax = hregX86_EAX(), ebx = hregX86_EBX(),
+ ecx = hregX86_ECX(), edx = hregX86_EDX();
+ Bool a_ok = True, b_ok = True, c_ok = True, d_ok = True;
+ HRegUsage u;
+ Int j;
+ initHRegUsage(&u);
+ addRegUsage_X86AMode(&u, i->Xin.Store.dst);
+ for (j = 0; j < u.n_used; j++) {
+ HReg r = u.hreg[j];
+ if (r == eax) a_ok = False;
+ if (r == ebx) b_ok = False;
+ if (r == ecx) c_ok = False;
+ if (r == edx) d_ok = False;
+ }
+ if (a_ok) swap = eax;
+ if (b_ok) swap = ebx;
+ if (c_ok) swap = ecx;
+ if (d_ok) swap = edx;
+ vassert(swap != INVALID_HREG);
+ /* xchgl %source, %swap. Could do better if swap is %eax. */
+ *p++ = 0x87;
+ p = doAMode_R(p, i->Xin.Store.src, swap);
+            /* movb lo8(%swap), (dst) */
+ *p++ = 0x88;
+ p = doAMode_M(p, swap, i->Xin.Store.dst);
+ /* xchgl %source, %swap. Could do better if swap is %eax. */
+ *p++ = 0x87;
+ p = doAMode_R(p, i->Xin.Store.src, swap);
+ goto done;
+ }
+ } /* if (i->Xin.Store.sz == 1) */
+ break;
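+      /* Example (illustrative): storing the low byte of %esi to
+         (%eax).  The amode mentions only %eax, so the assignment
+         order above leaves %edx as the chosen swap register, giving
+            xchgl %esi, %edx
+            movb  %dl, (%eax)
+            xchgl %esi, %edx                                       */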
+
+ case Xin_FpUnary:
+ /* gop %src, %dst
+ --> ffree %st7 ; fld %st(src) ; fop %st(0) ; fstp %st(1+dst)
+ */
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 0+hregNumber(i->Xin.FpUnary.src));
+ p = do_fop1_st(p, i->Xin.FpUnary.op);
+ p = do_fstp_st(p, 1+hregNumber(i->Xin.FpUnary.dst));
+ goto done;
+
+ case Xin_FpBinary:
+ if (i->Xin.FpBinary.op == Xfp_YL2X
+ || i->Xin.FpBinary.op == Xfp_YL2XP1) {
+ /* Have to do this specially. */
+ /* ffree %st7 ; fld %st(srcL) ;
+ ffree %st7 ; fld %st(srcR+1) ; fyl2x{p1} ; fstp(1+dst) */
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
+ *p++ = 0xD9;
+ *p++ = toUChar(i->Xin.FpBinary.op==Xfp_YL2X ? 0xF1 : 0xF9);
+ p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
+ goto done;
+ }
+ if (i->Xin.FpBinary.op == Xfp_ATAN) {
+ /* Have to do this specially. */
+ /* ffree %st7 ; fld %st(srcL) ;
+ ffree %st7 ; fld %st(srcR+1) ; fpatan ; fstp(1+dst) */
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcR));
+ *p++ = 0xD9; *p++ = 0xF3;
+ p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
+ goto done;
+ }
+ if (i->Xin.FpBinary.op == Xfp_PREM
+ || i->Xin.FpBinary.op == Xfp_PREM1
+ || i->Xin.FpBinary.op == Xfp_SCALE) {
+ /* Have to do this specially. */
+ /* ffree %st7 ; fld %st(srcR) ;
+ ffree %st7 ; fld %st(srcL+1) ; fprem/fprem1/fscale ; fstp(2+dst) ;
+ fincstp ; ffree %st7 */
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcR));
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 1+hregNumber(i->Xin.FpBinary.srcL));
+ *p++ = 0xD9;
+ switch (i->Xin.FpBinary.op) {
+ case Xfp_PREM: *p++ = 0xF8; break;
+ case Xfp_PREM1: *p++ = 0xF5; break;
+ case Xfp_SCALE: *p++ = 0xFD; break;
+ default: vpanic("emitX86Instr(FpBinary,PREM/PREM1/SCALE)");
+ }
+ p = do_fstp_st(p, 2+hregNumber(i->Xin.FpBinary.dst));
+ *p++ = 0xD9; *p++ = 0xF7;
+ p = do_ffree_st7(p);
+ goto done;
+ }
+ /* General case */
+ /* gop %srcL, %srcR, %dst
+ --> ffree %st7 ; fld %st(srcL) ; fop %st(1+srcR) ; fstp %st(1+dst)
+ */
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 0+hregNumber(i->Xin.FpBinary.srcL));
+ p = do_fop2_st(p, i->Xin.FpBinary.op,
+ 1+hregNumber(i->Xin.FpBinary.srcR));
+ p = do_fstp_st(p, 1+hregNumber(i->Xin.FpBinary.dst));
+ goto done;
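+      /* Example (illustrative): Xfp_ADD with srcL == %fake2,
+         srcR == %fake0, dst == %fake1 becomes
+            ffree %st(7) ; fld %st(2) ; fadd %st(1) ; fstp %st(2)
+         -- push a copy of srcL, combine it with srcR (now one slot
+         deeper), then pop the result into dst (likewise one slot
+         deeper). */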
+
+ case Xin_FpLdSt:
+ if (i->Xin.FpLdSt.isLoad) {
+ /* Load from memory into %fakeN.
+ --> ffree %st(7) ; fld{s/l/t} amode ; fstp st(N+1)
+ */
+ p = do_ffree_st7(p);
+ switch (i->Xin.FpLdSt.sz) {
+ case 4:
+ *p++ = 0xD9;
+ p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ case 8:
+ *p++ = 0xDD;
+ p = doAMode_M(p, fake(0)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ case 10:
+ *p++ = 0xDB;
+ p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ default:
+ vpanic("emitX86Instr(FpLdSt,load)");
+ }
+ p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdSt.reg));
+ goto done;
+ } else {
+ /* Store from %fakeN into memory.
+ --> ffree %st(7) ; fld st(N) ; fstp{l|s} amode
+ */
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdSt.reg));
+ switch (i->Xin.FpLdSt.sz) {
+ case 4:
+ *p++ = 0xD9;
+ p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ case 8:
+ *p++ = 0xDD;
+ p = doAMode_M(p, fake(3)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ case 10:
+ *p++ = 0xDB;
+ p = doAMode_M(p, fake(7)/*subopcode*/, i->Xin.FpLdSt.addr);
+ break;
+ default:
+ vpanic("emitX86Instr(FpLdSt,store)");
+ }
+ goto done;
+ }
+ break;
+
+ case Xin_FpLdStI:
+ if (i->Xin.FpLdStI.isLoad) {
+ /* Load from memory into %fakeN, converting from an int.
+ --> ffree %st(7) ; fild{w/l/ll} amode ; fstp st(N+1)
+ */
+ switch (i->Xin.FpLdStI.sz) {
+ case 8: opc = 0xDF; subopc_imm = 5; break;
+ case 4: opc = 0xDB; subopc_imm = 0; break;
+ case 2: vassert(0); opc = 0xDF; subopc_imm = 0; break;
+ default: vpanic("emitX86Instr(Xin_FpLdStI-load)");
+ }
+ p = do_ffree_st7(p);
+ *p++ = toUChar(opc);
+ p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
+ p = do_fstp_st(p, 1+hregNumber(i->Xin.FpLdStI.reg));
+ goto done;
+ } else {
+ /* Store from %fakeN into memory, converting to an int.
+ --> ffree %st(7) ; fld st(N) ; fistp{w/l/ll} amode
+ */
+ switch (i->Xin.FpLdStI.sz) {
+ case 8: opc = 0xDF; subopc_imm = 7; break;
+ case 4: opc = 0xDB; subopc_imm = 3; break;
+ case 2: opc = 0xDF; subopc_imm = 3; break;
+ default: vpanic("emitX86Instr(Xin_FpLdStI-store)");
+ }
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 0+hregNumber(i->Xin.FpLdStI.reg));
+ *p++ = toUChar(opc);
+ p = doAMode_M(p, fake(subopc_imm)/*subopcode*/, i->Xin.FpLdStI.addr);
+ goto done;
+ }
+ break;
+
+ case Xin_Fp64to32:
+ /* ffree %st7 ; fld %st(src) */
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 0+fregNo(i->Xin.Fp64to32.src));
+ /* subl $4, %esp */
+ *p++ = 0x83; *p++ = 0xEC; *p++ = 0x04;
+ /* fstps (%esp) */
+ *p++ = 0xD9; *p++ = 0x1C; *p++ = 0x24;
+ /* flds (%esp) */
+ *p++ = 0xD9; *p++ = 0x04; *p++ = 0x24;
+ /* addl $4, %esp */
+ *p++ = 0x83; *p++ = 0xC4; *p++ = 0x04;
+ /* fstp %st(1+dst) */
+ p = do_fstp_st(p, 1+fregNo(i->Xin.Fp64to32.dst));
+ goto done;
+
+ case Xin_FpCMov:
+ /* jmp fwds if !condition */
+ *p++ = toUChar(0x70 + (i->Xin.FpCMov.cond ^ 1));
+ *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
+ ptmp = p;
+
+ /* ffree %st7 ; fld %st(src) ; fstp %st(1+dst) */
+ p = do_ffree_st7(p);
+ p = do_fld_st(p, 0+fregNo(i->Xin.FpCMov.src));
+ p = do_fstp_st(p, 1+fregNo(i->Xin.FpCMov.dst));
+
+ /* Fill in the jump offset. */
+ *(ptmp-1) = toUChar(p - ptmp);
+ goto done;
+
+ case Xin_FpLdCW:
+ *p++ = 0xD9;
+ p = doAMode_M(p, fake(5)/*subopcode*/, i->Xin.FpLdCW.addr);
+ goto done;
+
+ case Xin_FpStSW_AX:
+ /* note, this emits fnstsw %ax, not fstsw %ax */
+ *p++ = 0xDF;
+ *p++ = 0xE0;
+ goto done;
+
+ case Xin_FpCmp:
+ /* gcmp %fL, %fR, %dst
+ -> ffree %st7; fpush %fL ; fucomp %(fR+1) ;
+ fnstsw %ax ; movl %eax, %dst
+ */
+ /* ffree %st7 */
+ p = do_ffree_st7(p);
+ /* fpush %fL */
+ p = do_fld_st(p, 0+fregNo(i->Xin.FpCmp.srcL));
+ /* fucomp %(fR+1) */
+ *p++ = 0xDD;
+ *p++ = toUChar(0xE8 + (7 & (1+fregNo(i->Xin.FpCmp.srcR))));
+ /* fnstsw %ax */
+ *p++ = 0xDF;
+ *p++ = 0xE0;
+ /* movl %eax, %dst */
+ *p++ = 0x89;
+ p = doAMode_R(p, hregX86_EAX(), i->Xin.FpCmp.dst);
+ goto done;
+
+ case Xin_SseConst: {
+ UShort con = i->Xin.SseConst.con;
+ p = push_word_from_tags(p, toUShort((con >> 12) & 0xF));
+ p = push_word_from_tags(p, toUShort((con >> 8) & 0xF));
+ p = push_word_from_tags(p, toUShort((con >> 4) & 0xF));
+ p = push_word_from_tags(p, toUShort(con & 0xF));
+      /* movups (%esp), %xmm-dst */
+ *p++ = 0x0F;
+ *p++ = 0x10;
+ *p++ = toUChar(0x04 + 8 * (7 & vregNo(i->Xin.SseConst.dst)));
+ *p++ = 0x24;
+ /* addl $16, %esp */
+ *p++ = 0x83;
+ *p++ = 0xC4;
+ *p++ = 0x10;
+ goto done;
+ }
+
+ case Xin_SseLdSt:
+ *p++ = 0x0F;
+ *p++ = toUChar(i->Xin.SseLdSt.isLoad ? 0x10 : 0x11);
+ p = doAMode_M(p, fake(vregNo(i->Xin.SseLdSt.reg)), i->Xin.SseLdSt.addr);
+ goto done;
+
+ case Xin_SseLdzLO:
+ vassert(i->Xin.SseLdzLO.sz == 4 || i->Xin.SseLdzLO.sz == 8);
+ /* movs[sd] amode, %xmm-dst */
+ *p++ = toUChar(i->Xin.SseLdzLO.sz==4 ? 0xF3 : 0xF2);
+ *p++ = 0x0F;
+ *p++ = 0x10;
+ p = doAMode_M(p, fake(vregNo(i->Xin.SseLdzLO.reg)),
+ i->Xin.SseLdzLO.addr);
+ goto done;
+
+ case Xin_Sse32Fx4:
+ xtra = 0;
+ *p++ = 0x0F;
+ switch (i->Xin.Sse32Fx4.op) {
+ case Xsse_ADDF: *p++ = 0x58; break;
+ case Xsse_DIVF: *p++ = 0x5E; break;
+ case Xsse_MAXF: *p++ = 0x5F; break;
+ case Xsse_MINF: *p++ = 0x5D; break;
+ case Xsse_MULF: *p++ = 0x59; break;
+ case Xsse_RCPF: *p++ = 0x53; break;
+ case Xsse_RSQRTF: *p++ = 0x52; break;
+ case Xsse_SQRTF: *p++ = 0x51; break;
+ case Xsse_SUBF: *p++ = 0x5C; break;
+ case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
+ case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
+ case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
+ case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, fake(vregNo(i->Xin.Sse32Fx4.dst)),
+ fake(vregNo(i->Xin.Sse32Fx4.src)) );
+ if (xtra & 0x100)
+ *p++ = toUChar(xtra & 0xFF);
+ goto done;
+
+ case Xin_Sse64Fx2:
+ xtra = 0;
+ *p++ = 0x66;
+ *p++ = 0x0F;
+ switch (i->Xin.Sse64Fx2.op) {
+ case Xsse_ADDF: *p++ = 0x58; break;
+ case Xsse_DIVF: *p++ = 0x5E; break;
+ case Xsse_MAXF: *p++ = 0x5F; break;
+ case Xsse_MINF: *p++ = 0x5D; break;
+ case Xsse_MULF: *p++ = 0x59; break;
+ case Xsse_RCPF: *p++ = 0x53; break;
+ case Xsse_RSQRTF: *p++ = 0x52; break;
+ case Xsse_SQRTF: *p++ = 0x51; break;
+ case Xsse_SUBF: *p++ = 0x5C; break;
+ case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
+ case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
+ case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
+ case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, fake(vregNo(i->Xin.Sse64Fx2.dst)),
+ fake(vregNo(i->Xin.Sse64Fx2.src)) );
+ if (xtra & 0x100)
+ *p++ = toUChar(xtra & 0xFF);
+ goto done;
+
+ case Xin_Sse32FLo:
+ xtra = 0;
+ *p++ = 0xF3;
+ *p++ = 0x0F;
+ switch (i->Xin.Sse32FLo.op) {
+ case Xsse_ADDF: *p++ = 0x58; break;
+ case Xsse_DIVF: *p++ = 0x5E; break;
+ case Xsse_MAXF: *p++ = 0x5F; break;
+ case Xsse_MINF: *p++ = 0x5D; break;
+ case Xsse_MULF: *p++ = 0x59; break;
+ case Xsse_RCPF: *p++ = 0x53; break;
+ case Xsse_RSQRTF: *p++ = 0x52; break;
+ case Xsse_SQRTF: *p++ = 0x51; break;
+ case Xsse_SUBF: *p++ = 0x5C; break;
+ case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
+ case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
+ case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
+ case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, fake(vregNo(i->Xin.Sse32FLo.dst)),
+ fake(vregNo(i->Xin.Sse32FLo.src)) );
+ if (xtra & 0x100)
+ *p++ = toUChar(xtra & 0xFF);
+ goto done;
+
+ case Xin_Sse64FLo:
+ xtra = 0;
+ *p++ = 0xF2;
+ *p++ = 0x0F;
+ switch (i->Xin.Sse64FLo.op) {
+ case Xsse_ADDF: *p++ = 0x58; break;
+ case Xsse_DIVF: *p++ = 0x5E; break;
+ case Xsse_MAXF: *p++ = 0x5F; break;
+ case Xsse_MINF: *p++ = 0x5D; break;
+ case Xsse_MULF: *p++ = 0x59; break;
+ case Xsse_RCPF: *p++ = 0x53; break;
+ case Xsse_RSQRTF: *p++ = 0x52; break;
+ case Xsse_SQRTF: *p++ = 0x51; break;
+ case Xsse_SUBF: *p++ = 0x5C; break;
+ case Xsse_CMPEQF: *p++ = 0xC2; xtra = 0x100; break;
+ case Xsse_CMPLTF: *p++ = 0xC2; xtra = 0x101; break;
+ case Xsse_CMPLEF: *p++ = 0xC2; xtra = 0x102; break;
+ case Xsse_CMPUNF: *p++ = 0xC2; xtra = 0x103; break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, fake(vregNo(i->Xin.Sse64FLo.dst)),
+ fake(vregNo(i->Xin.Sse64FLo.src)) );
+ if (xtra & 0x100)
+ *p++ = toUChar(xtra & 0xFF);
+ goto done;
+
+ case Xin_SseReRg:
+# define XX(_n) *p++ = (_n)
+ switch (i->Xin.SseReRg.op) {
+ case Xsse_MOV: /*movups*/ XX(0x0F); XX(0x10); break;
+ case Xsse_OR: XX(0x0F); XX(0x56); break;
+ case Xsse_XOR: XX(0x0F); XX(0x57); break;
+ case Xsse_AND: XX(0x0F); XX(0x54); break;
+ case Xsse_PACKSSD: XX(0x66); XX(0x0F); XX(0x6B); break;
+ case Xsse_PACKSSW: XX(0x66); XX(0x0F); XX(0x63); break;
+ case Xsse_PACKUSW: XX(0x66); XX(0x0F); XX(0x67); break;
+ case Xsse_ADD8: XX(0x66); XX(0x0F); XX(0xFC); break;
+ case Xsse_ADD16: XX(0x66); XX(0x0F); XX(0xFD); break;
+ case Xsse_ADD32: XX(0x66); XX(0x0F); XX(0xFE); break;
+ case Xsse_ADD64: XX(0x66); XX(0x0F); XX(0xD4); break;
+ case Xsse_QADD8S: XX(0x66); XX(0x0F); XX(0xEC); break;
+ case Xsse_QADD16S: XX(0x66); XX(0x0F); XX(0xED); break;
+ case Xsse_QADD8U: XX(0x66); XX(0x0F); XX(0xDC); break;
+ case Xsse_QADD16U: XX(0x66); XX(0x0F); XX(0xDD); break;
+ case Xsse_AVG8U: XX(0x66); XX(0x0F); XX(0xE0); break;
+ case Xsse_AVG16U: XX(0x66); XX(0x0F); XX(0xE3); break;
+ case Xsse_CMPEQ8: XX(0x66); XX(0x0F); XX(0x74); break;
+ case Xsse_CMPEQ16: XX(0x66); XX(0x0F); XX(0x75); break;
+ case Xsse_CMPEQ32: XX(0x66); XX(0x0F); XX(0x76); break;
+ case Xsse_CMPGT8S: XX(0x66); XX(0x0F); XX(0x64); break;
+ case Xsse_CMPGT16S: XX(0x66); XX(0x0F); XX(0x65); break;
+ case Xsse_CMPGT32S: XX(0x66); XX(0x0F); XX(0x66); break;
+ case Xsse_MAX16S: XX(0x66); XX(0x0F); XX(0xEE); break;
+ case Xsse_MAX8U: XX(0x66); XX(0x0F); XX(0xDE); break;
+ case Xsse_MIN16S: XX(0x66); XX(0x0F); XX(0xEA); break;
+ case Xsse_MIN8U: XX(0x66); XX(0x0F); XX(0xDA); break;
+ case Xsse_MULHI16U: XX(0x66); XX(0x0F); XX(0xE4); break;
+ case Xsse_MULHI16S: XX(0x66); XX(0x0F); XX(0xE5); break;
+ case Xsse_MUL16: XX(0x66); XX(0x0F); XX(0xD5); break;
+ case Xsse_SHL16: XX(0x66); XX(0x0F); XX(0xF1); break;
+ case Xsse_SHL32: XX(0x66); XX(0x0F); XX(0xF2); break;
+ case Xsse_SHL64: XX(0x66); XX(0x0F); XX(0xF3); break;
+ case Xsse_SAR16: XX(0x66); XX(0x0F); XX(0xE1); break;
+ case Xsse_SAR32: XX(0x66); XX(0x0F); XX(0xE2); break;
+ case Xsse_SHR16: XX(0x66); XX(0x0F); XX(0xD1); break;
+ case Xsse_SHR32: XX(0x66); XX(0x0F); XX(0xD2); break;
+ case Xsse_SHR64: XX(0x66); XX(0x0F); XX(0xD3); break;
+ case Xsse_SUB8: XX(0x66); XX(0x0F); XX(0xF8); break;
+ case Xsse_SUB16: XX(0x66); XX(0x0F); XX(0xF9); break;
+ case Xsse_SUB32: XX(0x66); XX(0x0F); XX(0xFA); break;
+ case Xsse_SUB64: XX(0x66); XX(0x0F); XX(0xFB); break;
+ case Xsse_QSUB8S: XX(0x66); XX(0x0F); XX(0xE8); break;
+ case Xsse_QSUB16S: XX(0x66); XX(0x0F); XX(0xE9); break;
+ case Xsse_QSUB8U: XX(0x66); XX(0x0F); XX(0xD8); break;
+ case Xsse_QSUB16U: XX(0x66); XX(0x0F); XX(0xD9); break;
+ case Xsse_UNPCKHB: XX(0x66); XX(0x0F); XX(0x68); break;
+ case Xsse_UNPCKHW: XX(0x66); XX(0x0F); XX(0x69); break;
+ case Xsse_UNPCKHD: XX(0x66); XX(0x0F); XX(0x6A); break;
+ case Xsse_UNPCKHQ: XX(0x66); XX(0x0F); XX(0x6D); break;
+ case Xsse_UNPCKLB: XX(0x66); XX(0x0F); XX(0x60); break;
+ case Xsse_UNPCKLW: XX(0x66); XX(0x0F); XX(0x61); break;
+ case Xsse_UNPCKLD: XX(0x66); XX(0x0F); XX(0x62); break;
+ case Xsse_UNPCKLQ: XX(0x66); XX(0x0F); XX(0x6C); break;
+ default: goto bad;
+ }
+ p = doAMode_R(p, fake(vregNo(i->Xin.SseReRg.dst)),
+ fake(vregNo(i->Xin.SseReRg.src)) );
+# undef XX
+ goto done;
+
+ case Xin_SseCMov:
+ /* jmp fwds if !condition */
+ *p++ = toUChar(0x70 + (i->Xin.SseCMov.cond ^ 1));
+ *p++ = 0; /* # of bytes in the next bit, which we don't know yet */
+ ptmp = p;
+
+ /* movaps %src, %dst */
+ *p++ = 0x0F;
+ *p++ = 0x28;
+ p = doAMode_R(p, fake(vregNo(i->Xin.SseCMov.dst)),
+ fake(vregNo(i->Xin.SseCMov.src)) );
+
+ /* Fill in the jump offset. */
+ *(ptmp-1) = toUChar(p - ptmp);
+ goto done;
+
+ case Xin_SseShuf:
+ *p++ = 0x66;
+ *p++ = 0x0F;
+ *p++ = 0x70;
+ p = doAMode_R(p, fake(vregNo(i->Xin.SseShuf.dst)),
+ fake(vregNo(i->Xin.SseShuf.src)) );
+ *p++ = (UChar)(i->Xin.SseShuf.order);
+ goto done;
+
+ default:
+ goto bad;
+ }
+
+ bad:
+ ppX86Instr(i, mode64);
+ vpanic("emit_X86Instr");
+ /*NOTREACHED*/
+
+ done:
+ vassert(p - &buf[0] <= 32);
+ return p - &buf[0];
+
+# undef fake
+}
+
+/*---------------------------------------------------------------*/
+/*--- end host_x86_defs.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_x86_defs.h b/VEX/priv/host_x86_defs.h
new file mode 100644
index 0000000..fde700a
--- /dev/null
+++ b/VEX/priv/host_x86_defs.h
@@ -0,0 +1,694 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_x86_defs.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#ifndef __VEX_HOST_X86_DEFS_H
+#define __VEX_HOST_X86_DEFS_H
+
+
+/* --------- Registers. --------- */
+
+/* The usual HReg abstraction. There are 8 real int regs,
+ 6 real float regs, and 8 real vector regs.
+*/
+
+extern void ppHRegX86 ( HReg );
+
+extern HReg hregX86_EAX ( void );
+extern HReg hregX86_EBX ( void );
+extern HReg hregX86_ECX ( void );
+extern HReg hregX86_EDX ( void );
+extern HReg hregX86_ESP ( void );
+extern HReg hregX86_EBP ( void );
+extern HReg hregX86_ESI ( void );
+extern HReg hregX86_EDI ( void );
+
+extern HReg hregX86_FAKE0 ( void );
+extern HReg hregX86_FAKE1 ( void );
+extern HReg hregX86_FAKE2 ( void );
+extern HReg hregX86_FAKE3 ( void );
+extern HReg hregX86_FAKE4 ( void );
+extern HReg hregX86_FAKE5 ( void );
+
+extern HReg hregX86_XMM0 ( void );
+extern HReg hregX86_XMM1 ( void );
+extern HReg hregX86_XMM2 ( void );
+extern HReg hregX86_XMM3 ( void );
+extern HReg hregX86_XMM4 ( void );
+extern HReg hregX86_XMM5 ( void );
+extern HReg hregX86_XMM6 ( void );
+extern HReg hregX86_XMM7 ( void );
+
+
+/* --------- Condition codes, Intel encoding. --------- */
+
+typedef
+ enum {
+ Xcc_O = 0, /* overflow */
+ Xcc_NO = 1, /* no overflow */
+
+ Xcc_B = 2, /* below */
+ Xcc_NB = 3, /* not below */
+
+ Xcc_Z = 4, /* zero */
+ Xcc_NZ = 5, /* not zero */
+
+ Xcc_BE = 6, /* below or equal */
+ Xcc_NBE = 7, /* not below or equal */
+
+ Xcc_S = 8, /* negative */
+ Xcc_NS = 9, /* not negative */
+
+ Xcc_P = 10, /* parity even */
+ Xcc_NP = 11, /* not parity even */
+
+ Xcc_L = 12, /* jump less */
+ Xcc_NL = 13, /* not less */
+
+ Xcc_LE = 14, /* less or equal */
+ Xcc_NLE = 15, /* not less or equal */
+
+ Xcc_ALWAYS = 16 /* the usual hack */
+ }
+ X86CondCode;
+
+extern HChar* showX86CondCode ( X86CondCode );
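+
+/* Note that these are the Intel condition-code encodings, so
+   flipping the bottom bit negates a condition (e.g. Xcc_Z ^ 1 ==
+   Xcc_NZ).  The instruction emitter relies on this when synthesising
+   jump-over sequences. */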
+
+
+/* --------- Memory address expressions (amodes). --------- */
+
+typedef
+ enum {
+ Xam_IR, /* Immediate + Reg */
+ Xam_IRRS /* Immediate + Reg1 + (Reg2 << Shift) */
+ }
+ X86AModeTag;
+
+typedef
+ struct {
+ X86AModeTag tag;
+ union {
+ struct {
+ UInt imm;
+ HReg reg;
+ } IR;
+ struct {
+ UInt imm;
+ HReg base;
+ HReg index;
+ Int shift; /* 0, 1, 2 or 3 only */
+ } IRRS;
+ } Xam;
+ }
+ X86AMode;
+
+extern X86AMode* X86AMode_IR ( UInt, HReg );
+extern X86AMode* X86AMode_IRRS ( UInt, HReg, HReg, Int );
+
+extern X86AMode* dopyX86AMode ( X86AMode* );
+
+extern void ppX86AMode ( X86AMode* );
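+
+/* Example (illustrative): the amode 8(%ebp,%esi,4) would be built as
+
+      X86AMode_IRRS(8, hregX86_EBP(), hregX86_ESI(), 2);
+
+   and 16(%esp) as
+
+      X86AMode_IR(16, hregX86_ESP());
+*/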
+
+
+/* --------- Operand, which can be reg, immediate or memory. --------- */
+
+typedef
+ enum {
+ Xrmi_Imm,
+ Xrmi_Reg,
+ Xrmi_Mem
+ }
+ X86RMITag;
+
+typedef
+ struct {
+ X86RMITag tag;
+ union {
+ struct {
+ UInt imm32;
+ } Imm;
+ struct {
+ HReg reg;
+ } Reg;
+ struct {
+ X86AMode* am;
+ } Mem;
+ }
+ Xrmi;
+ }
+ X86RMI;
+
+extern X86RMI* X86RMI_Imm ( UInt );
+extern X86RMI* X86RMI_Reg ( HReg );
+extern X86RMI* X86RMI_Mem ( X86AMode* );
+
+extern void ppX86RMI ( X86RMI* );
+
+
+/* --------- Operand, which can be reg or immediate only. --------- */
+
+typedef
+ enum {
+ Xri_Imm,
+ Xri_Reg
+ }
+ X86RITag;
+
+typedef
+ struct {
+ X86RITag tag;
+ union {
+ struct {
+ UInt imm32;
+ } Imm;
+ struct {
+ HReg reg;
+ } Reg;
+ }
+ Xri;
+ }
+ X86RI;
+
+extern X86RI* X86RI_Imm ( UInt );
+extern X86RI* X86RI_Reg ( HReg );
+
+extern void ppX86RI ( X86RI* );
+
+
+/* --------- Operand, which can be reg or memory only. --------- */
+
+typedef
+ enum {
+ Xrm_Reg,
+ Xrm_Mem
+ }
+ X86RMTag;
+
+typedef
+ struct {
+ X86RMTag tag;
+ union {
+ struct {
+ HReg reg;
+ } Reg;
+ struct {
+ X86AMode* am;
+ } Mem;
+ }
+ Xrm;
+ }
+ X86RM;
+
+extern X86RM* X86RM_Reg ( HReg );
+extern X86RM* X86RM_Mem ( X86AMode* );
+
+extern void ppX86RM ( X86RM* );
+
+
+/* --------- Instructions. --------- */
+
+/* --------- */
+typedef
+ enum {
+ Xun_NEG,
+ Xun_NOT
+ }
+ X86UnaryOp;
+
+extern HChar* showX86UnaryOp ( X86UnaryOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Xalu_INVALID,
+ Xalu_MOV,
+ Xalu_CMP,
+ Xalu_ADD, Xalu_SUB, Xalu_ADC, Xalu_SBB,
+ Xalu_AND, Xalu_OR, Xalu_XOR,
+ Xalu_MUL
+ }
+ X86AluOp;
+
+extern HChar* showX86AluOp ( X86AluOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Xsh_INVALID,
+ Xsh_SHL, Xsh_SHR, Xsh_SAR
+ }
+ X86ShiftOp;
+
+extern HChar* showX86ShiftOp ( X86ShiftOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Xfp_INVALID,
+ /* Binary */
+ Xfp_ADD, Xfp_SUB, Xfp_MUL, Xfp_DIV,
+ Xfp_SCALE, Xfp_ATAN, Xfp_YL2X, Xfp_YL2XP1, Xfp_PREM, Xfp_PREM1,
+ /* Unary */
+ Xfp_SQRT, Xfp_ABS, Xfp_NEG, Xfp_MOV, Xfp_SIN, Xfp_COS, Xfp_TAN,
+ Xfp_ROUND, Xfp_2XM1
+ }
+ X86FpOp;
+
+extern HChar* showX86FpOp ( X86FpOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Xsse_INVALID,
+ /* mov */
+ Xsse_MOV,
+ /* Floating point binary */
+ Xsse_ADDF, Xsse_SUBF, Xsse_MULF, Xsse_DIVF,
+ Xsse_MAXF, Xsse_MINF,
+ Xsse_CMPEQF, Xsse_CMPLTF, Xsse_CMPLEF, Xsse_CMPUNF,
+ /* Floating point unary */
+ Xsse_RCPF, Xsse_RSQRTF, Xsse_SQRTF,
+ /* Bitwise */
+ Xsse_AND, Xsse_OR, Xsse_XOR, Xsse_ANDN,
+ /* Integer binary */
+ Xsse_ADD8, Xsse_ADD16, Xsse_ADD32, Xsse_ADD64,
+ Xsse_QADD8U, Xsse_QADD16U,
+ Xsse_QADD8S, Xsse_QADD16S,
+ Xsse_SUB8, Xsse_SUB16, Xsse_SUB32, Xsse_SUB64,
+ Xsse_QSUB8U, Xsse_QSUB16U,
+ Xsse_QSUB8S, Xsse_QSUB16S,
+ Xsse_MUL16,
+ Xsse_MULHI16U,
+ Xsse_MULHI16S,
+ Xsse_AVG8U, Xsse_AVG16U,
+ Xsse_MAX16S,
+ Xsse_MAX8U,
+ Xsse_MIN16S,
+ Xsse_MIN8U,
+ Xsse_CMPEQ8, Xsse_CMPEQ16, Xsse_CMPEQ32,
+ Xsse_CMPGT8S, Xsse_CMPGT16S, Xsse_CMPGT32S,
+ Xsse_SHL16, Xsse_SHL32, Xsse_SHL64,
+ Xsse_SHR16, Xsse_SHR32, Xsse_SHR64,
+ Xsse_SAR16, Xsse_SAR32,
+ Xsse_PACKSSD, Xsse_PACKSSW, Xsse_PACKUSW,
+ Xsse_UNPCKHB, Xsse_UNPCKHW, Xsse_UNPCKHD, Xsse_UNPCKHQ,
+ Xsse_UNPCKLB, Xsse_UNPCKLW, Xsse_UNPCKLD, Xsse_UNPCKLQ
+ }
+ X86SseOp;
+
+extern HChar* showX86SseOp ( X86SseOp );
+
+
+/* --------- */
+typedef
+ enum {
+ Xin_Alu32R, /* 32-bit mov/arith/logical, dst=REG */
+ Xin_Alu32M, /* 32-bit mov/arith/logical, dst=MEM */
+ Xin_Sh32, /* 32-bit shift/rotate, dst=REG */
+ Xin_Test32, /* 32-bit test of REG or MEM against imm32 (AND, set
+ flags, discard result) */
+ Xin_Unary32, /* 32-bit not and neg */
+ Xin_Lea32, /* 32-bit compute EA into a reg */
+ Xin_MulL, /* 32 x 32 -> 64 multiply */
+ Xin_Div, /* 64/32 -> (32,32) div and mod */
+ Xin_Sh3232, /* shldl or shrdl */
+ Xin_Push, /* push (32-bit?) value on stack */
+ Xin_Call, /* call to address in register */
+ Xin_Goto, /* conditional/unconditional jmp to dst */
+ Xin_CMov32, /* conditional move */
+ Xin_LoadEX, /* mov{s,z}{b,w}l from mem to reg */
+ Xin_Store, /* store 16/8 bit value in memory */
+ Xin_Set32, /* convert condition code to 32-bit value */
+ Xin_Bsfr32, /* 32-bit bsf/bsr */
+ Xin_MFence, /* mem fence (not just sse2, but sse0 and 1 too) */
+ Xin_ACAS, /* 8/16/32-bit lock;cmpxchg */
+ Xin_DACAS, /* lock;cmpxchg8b (doubleword ACAS, 2 x 32-bit only) */
+
+ Xin_FpUnary, /* FP fake unary op */
+ Xin_FpBinary, /* FP fake binary op */
+ Xin_FpLdSt, /* FP fake load/store */
+ Xin_FpLdStI, /* FP fake load/store, converting to/from Int */
+ Xin_Fp64to32, /* FP round IEEE754 double to IEEE754 single */
+ Xin_FpCMov, /* FP fake floating point conditional move */
+ Xin_FpLdCW, /* fldcw */
+ Xin_FpStSW_AX, /* fstsw %ax */
+ Xin_FpCmp, /* FP compare, generating a C320 value into int reg */
+
+ Xin_SseConst, /* Generate restricted SSE literal */
+ Xin_SseLdSt, /* SSE load/store, no alignment constraints */
+ Xin_SseLdzLO, /* SSE load low 32/64 bits, zero remainder of reg */
+ Xin_Sse32Fx4, /* SSE binary, 32Fx4 */
+ Xin_Sse32FLo, /* SSE binary, 32F in lowest lane only */
+ Xin_Sse64Fx2, /* SSE binary, 64Fx2 */
+ Xin_Sse64FLo, /* SSE binary, 64F in lowest lane only */
+ Xin_SseReRg, /* SSE binary general reg-reg, Re, Rg */
+ Xin_SseCMov, /* SSE conditional move */
+ Xin_SseShuf /* SSE2 shuffle (pshufd) */
+ }
+ X86InstrTag;
+
+/* Destinations are on the RIGHT (second operand) */
+
+typedef
+ struct {
+ X86InstrTag tag;
+ union {
+ struct {
+ X86AluOp op;
+ X86RMI* src;
+ HReg dst;
+ } Alu32R;
+ struct {
+ X86AluOp op;
+ X86RI* src;
+ X86AMode* dst;
+ } Alu32M;
+ struct {
+ X86ShiftOp op;
+ UInt src; /* shift amount, or 0 means %cl */
+ HReg dst;
+ } Sh32;
+ struct {
+ UInt imm32;
+ X86RM* dst; /* not written, only read */
+ } Test32;
+ /* Not and Neg */
+ struct {
+ X86UnaryOp op;
+ HReg dst;
+ } Unary32;
+ /* 32-bit compute EA into a reg */
+ struct {
+ X86AMode* am;
+ HReg dst;
+ } Lea32;
+ /* EDX:EAX = EAX *s/u r/m32 */
+ struct {
+ Bool syned;
+ X86RM* src;
+ } MulL;
+ /* x86 div/idiv instruction. Modifies EDX and EAX and reads src. */
+ struct {
+ Bool syned;
+ X86RM* src;
+ } Div;
+ /* shld/shrd. op may only be Xsh_SHL or Xsh_SHR */
+ struct {
+ X86ShiftOp op;
+ UInt amt; /* shift amount, or 0 means %cl */
+ HReg src;
+ HReg dst;
+ } Sh3232;
+ struct {
+ X86RMI* src;
+ } Push;
+ /* Pseudo-insn. Call target (an absolute address), on given
+ condition (which could be Xcc_ALWAYS). */
+ struct {
+ X86CondCode cond;
+ Addr32 target;
+ Int regparms; /* 0 .. 3 */
+ } Call;
+ /* Pseudo-insn. Goto dst, on given condition (which could be
+ Xcc_ALWAYS). */
+ struct {
+ IRJumpKind jk;
+ X86CondCode cond;
+ X86RI* dst;
+ } Goto;
+ /* Mov src to dst on the given condition, which may not
+ be the bogus Xcc_ALWAYS. */
+ struct {
+ X86CondCode cond;
+ X86RM* src;
+ HReg dst;
+ } CMov32;
+ /* Sign/Zero extending loads. Dst size is always 32 bits. */
+ struct {
+ UChar szSmall;
+ Bool syned;
+ X86AMode* src;
+ HReg dst;
+ } LoadEX;
+ /* 16/8 bit stores, which are troublesome (particularly
+ 8-bit) */
+ struct {
+ UChar sz; /* only 1 or 2 */
+ HReg src;
+ X86AMode* dst;
+ } Store;
+ /* Convert a x86 condition code to a 32-bit value (0 or 1). */
+ struct {
+ X86CondCode cond;
+ HReg dst;
+ } Set32;
+ /* 32-bit bsf or bsr. */
+ struct {
+ Bool isFwds;
+ HReg src;
+ HReg dst;
+ } Bsfr32;
+ /* Mem fence (not just sse2, but sse0 and 1 too). In short,
+ an insn which flushes all preceding loads and stores as
+ much as possible before continuing. On SSE2 we emit a
+ real "mfence", on SSE1 "sfence ; lock addl $0,0(%esp)" and
+ on SSE0 "lock addl $0,0(%esp)". This insn therefore
+ carries the host's hwcaps so the assembler knows what to
+ emit. */
+ struct {
+ UInt hwcaps;
+ } MFence;
+ /* "lock;cmpxchg": mem address in .addr,
+ expected value in %eax, new value in %ebx */
+ struct {
+ X86AMode* addr;
+ UChar sz; /* 1, 2 or 4 */
+ } ACAS;
+ /* "lock;cmpxchg8b": mem address in .addr, expected value in
+ %edx:%eax, new value in %ecx:%ebx */
+ struct {
+ X86AMode* addr;
+ } DACAS;
+
+ /* X86 Floating point (fake 3-operand, "flat reg file" insns) */
+ struct {
+ X86FpOp op;
+ HReg src;
+ HReg dst;
+ } FpUnary;
+ struct {
+ X86FpOp op;
+ HReg srcL;
+ HReg srcR;
+ HReg dst;
+ } FpBinary;
+ struct {
+ Bool isLoad;
+ UChar sz; /* only 4 (IEEE single) or 8 (IEEE double) */
+ HReg reg;
+ X86AMode* addr;
+ } FpLdSt;
+ /* Move 64-bit float to/from memory, converting to/from
+ signed int on the way. Note the conversions will observe
+ the host FPU rounding mode currently in force. */
+ struct {
+ Bool isLoad;
+ UChar sz; /* only 2, 4 or 8 */
+ HReg reg;
+ X86AMode* addr;
+ } FpLdStI;
+ /* By observing the current FPU rounding mode, round (etc)
+ src into dst given that dst should be interpreted as an
+ IEEE754 32-bit (float) type. */
+ struct {
+ HReg src;
+ HReg dst;
+ } Fp64to32;
+ /* Mov src to dst on the given condition, which may not
+ be the bogus Xcc_ALWAYS. */
+ struct {
+ X86CondCode cond;
+ HReg src;
+ HReg dst;
+ } FpCMov;
+ /* Load the FPU's 16-bit control word (fldcw) */
+ struct {
+ X86AMode* addr;
+ }
+ FpLdCW;
+ /* fstsw %ax */
+ struct {
+ /* no fields */
+ }
+ FpStSW_AX;
+ /* Do a compare, generating the C320 bits into the dst. */
+ struct {
+ HReg srcL;
+ HReg srcR;
+ HReg dst;
+ } FpCmp;
+
+ /* Simplistic SSE[123] */
+ struct {
+ UShort con;
+ HReg dst;
+ } SseConst;
+ struct {
+ Bool isLoad;
+ HReg reg;
+ X86AMode* addr;
+ } SseLdSt;
+ struct {
+ UChar sz; /* 4 or 8 only */
+ HReg reg;
+ X86AMode* addr;
+ } SseLdzLO;
+ struct {
+ X86SseOp op;
+ HReg src;
+ HReg dst;
+ } Sse32Fx4;
+ struct {
+ X86SseOp op;
+ HReg src;
+ HReg dst;
+ } Sse32FLo;
+ struct {
+ X86SseOp op;
+ HReg src;
+ HReg dst;
+ } Sse64Fx2;
+ struct {
+ X86SseOp op;
+ HReg src;
+ HReg dst;
+ } Sse64FLo;
+ struct {
+ X86SseOp op;
+ HReg src;
+ HReg dst;
+ } SseReRg;
+ /* Mov src to dst on the given condition, which may not
+ be the bogus Xcc_ALWAYS. */
+ struct {
+ X86CondCode cond;
+ HReg src;
+ HReg dst;
+ } SseCMov;
+ struct {
+ Int order; /* 0 <= order <= 0xFF */
+ HReg src;
+ HReg dst;
+ } SseShuf;
+
+ } Xin;
+ }
+ X86Instr;
+
+extern X86Instr* X86Instr_Alu32R ( X86AluOp, X86RMI*, HReg );
+extern X86Instr* X86Instr_Alu32M ( X86AluOp, X86RI*, X86AMode* );
+extern X86Instr* X86Instr_Unary32 ( X86UnaryOp op, HReg dst );
+extern X86Instr* X86Instr_Lea32 ( X86AMode* am, HReg dst );
+
+extern X86Instr* X86Instr_Sh32 ( X86ShiftOp, UInt, HReg );
+extern X86Instr* X86Instr_Test32 ( UInt imm32, X86RM* dst );
+extern X86Instr* X86Instr_MulL ( Bool syned, X86RM* );
+extern X86Instr* X86Instr_Div ( Bool syned, X86RM* );
+extern X86Instr* X86Instr_Sh3232 ( X86ShiftOp, UInt amt, HReg src, HReg dst );
+extern X86Instr* X86Instr_Push ( X86RMI* );
+extern X86Instr* X86Instr_Call ( X86CondCode, Addr32, Int );
+extern X86Instr* X86Instr_Goto ( IRJumpKind, X86CondCode cond, X86RI* dst );
+extern X86Instr* X86Instr_CMov32 ( X86CondCode, X86RM* src, HReg dst );
+extern X86Instr* X86Instr_LoadEX ( UChar szSmall, Bool syned,
+ X86AMode* src, HReg dst );
+extern X86Instr* X86Instr_Store ( UChar sz, HReg src, X86AMode* dst );
+extern X86Instr* X86Instr_Set32 ( X86CondCode cond, HReg dst );
+extern X86Instr* X86Instr_Bsfr32 ( Bool isFwds, HReg src, HReg dst );
+extern X86Instr* X86Instr_MFence ( UInt hwcaps );
+extern X86Instr* X86Instr_ACAS ( X86AMode* addr, UChar sz );
+extern X86Instr* X86Instr_DACAS ( X86AMode* addr );
+
+extern X86Instr* X86Instr_FpUnary ( X86FpOp op, HReg src, HReg dst );
+extern X86Instr* X86Instr_FpBinary ( X86FpOp op, HReg srcL, HReg srcR, HReg dst );
+extern X86Instr* X86Instr_FpLdSt ( Bool isLoad, UChar sz, HReg reg, X86AMode* );
+extern X86Instr* X86Instr_FpLdStI ( Bool isLoad, UChar sz, HReg reg, X86AMode* );
+extern X86Instr* X86Instr_Fp64to32 ( HReg src, HReg dst );
+extern X86Instr* X86Instr_FpCMov ( X86CondCode, HReg src, HReg dst );
+extern X86Instr* X86Instr_FpLdCW ( X86AMode* );
+extern X86Instr* X86Instr_FpStSW_AX ( void );
+extern X86Instr* X86Instr_FpCmp ( HReg srcL, HReg srcR, HReg dst );
+
+extern X86Instr* X86Instr_SseConst ( UShort con, HReg dst );
+extern X86Instr* X86Instr_SseLdSt ( Bool isLoad, HReg, X86AMode* );
+extern X86Instr* X86Instr_SseLdzLO ( Int sz, HReg, X86AMode* );
+extern X86Instr* X86Instr_Sse32Fx4 ( X86SseOp, HReg, HReg );
+extern X86Instr* X86Instr_Sse32FLo ( X86SseOp, HReg, HReg );
+extern X86Instr* X86Instr_Sse64Fx2 ( X86SseOp, HReg, HReg );
+extern X86Instr* X86Instr_Sse64FLo ( X86SseOp, HReg, HReg );
+extern X86Instr* X86Instr_SseReRg ( X86SseOp, HReg, HReg );
+extern X86Instr* X86Instr_SseCMov ( X86CondCode, HReg src, HReg dst );
+extern X86Instr* X86Instr_SseShuf ( Int order, HReg src, HReg dst );
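+
+/* Example (illustrative): 'addl $4, %esp' would be constructed as
+
+      X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(4), hregX86_ESP());
+
+   with the destination as the second (right-hand) operand, per the
+   convention noted above. */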
+
+
+extern void ppX86Instr ( X86Instr*, Bool );
+
+/* Some functions that insulate the register allocator from details
+ of the underlying instruction set. */
+extern void getRegUsage_X86Instr ( HRegUsage*, X86Instr*, Bool );
+extern void mapRegs_X86Instr ( HRegRemap*, X86Instr*, Bool );
+extern Bool isMove_X86Instr ( X86Instr*, HReg*, HReg* );
+extern Int emit_X86Instr ( UChar* buf, Int nbuf, X86Instr*,
+ Bool, void* dispatch );
+
+extern void genSpill_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offset, Bool );
+extern void genReload_X86 ( /*OUT*/HInstr** i1, /*OUT*/HInstr** i2,
+ HReg rreg, Int offset, Bool );
+
+extern X86Instr* directReload_X86 ( X86Instr* i,
+ HReg vreg, Short spill_off );
+extern void getAllocableRegs_X86 ( Int*, HReg** );
+extern HInstrArray* iselSB_X86 ( IRSB*, VexArch,
+ VexArchInfo*,
+ VexAbiInfo* );
+
+#endif /* ndef __VEX_HOST_X86_DEFS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end host_x86_defs.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/host_x86_isel.c b/VEX/priv/host_x86_isel.c
new file mode 100644
index 0000000..fc5cf05
--- /dev/null
+++ b/VEX/priv/host_x86_isel.c
@@ -0,0 +1,4079 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin host_x86_isel.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+#include "ir_match.h"
+#include "main_util.h"
+#include "main_globals.h"
+#include "host_generic_regs.h"
+#include "host_generic_simd64.h"
+#include "host_x86_defs.h"
+
+/* TODO 21 Apr 2005:
+
+ -- (Really an assembler issue) don't emit CMov32 as a cmov
+ insn, since that's expensive on P4 and conditional branch
+ is cheaper if (as we expect) the condition is highly predictable
+
+ -- preserve xmm registers across function calls (by declaring them
+ as trashed by call insns)
+
+ -- preserve x87 ST stack discipline across function calls. Sigh.
+
+ -- Check doHelperCall: if a call is conditional, we cannot safely
+ compute any regparm args directly to registers. Hence, the
+ fast-regparm marshalling should be restricted to unconditional
+ calls only.
+*/
+
+/*---------------------------------------------------------*/
+/*--- x87 control word stuff ---*/
+/*---------------------------------------------------------*/
+
+/* Vex-generated code expects to run with the FPU set as follows: all
+ exceptions masked, round-to-nearest, precision = 53 bits. This
+ corresponds to a FPU control word value of 0x027F.
+
+ Similarly the SSE control word (%mxcsr) should be 0x1F80.
+
+   %fpucw and %mxcsr should have these values on entry to
+   Vex-generated code, and those values should be unchanged
+   at exit.
+*/
+
+#define DEFAULT_FPUCW 0x027F
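+
+/* For reference, the fields of 0x027F: bits 5..0 set = all six
+   exception classes masked (bit 6 is reserved and reads as 1); PC
+   (bits 9:8) = 10b = 53-bit precision; RC (bits 11:10) = 00b = round
+   to nearest even. */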
+
+/* debugging only, do not use */
+/* define DEFAULT_FPUCW 0x037F */
+
+
+/*---------------------------------------------------------*/
+/*--- misc helpers ---*/
+/*---------------------------------------------------------*/
+
+/* These are duplicated in guest-x86/toIR.c */
+static IRExpr* unop ( IROp op, IRExpr* a )
+{
+ return IRExpr_Unop(op, a);
+}
+
+static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
+{
+ return IRExpr_Binop(op, a1, a2);
+}
+
+static IRExpr* bind ( Int binder )
+{
+ return IRExpr_Binder(binder);
+}
+
+static Bool isZeroU8 ( IRExpr* e )
+{
+ return e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U8
+ && e->Iex.Const.con->Ico.U8 == 0;
+}
+
+static Bool isZeroU32 ( IRExpr* e )
+{
+ return e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U32
+ && e->Iex.Const.con->Ico.U32 == 0;
+}
+
+static Bool isZeroU64 ( IRExpr* e )
+{
+ return e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U64
+ && e->Iex.Const.con->Ico.U64 == 0ULL;
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISelEnv ---*/
+/*---------------------------------------------------------*/
+
+/* This carries around:
+
+ - A mapping from IRTemp to IRType, giving the type of any IRTemp we
+ might encounter. This is computed before insn selection starts,
+ and does not change.
+
+ - A mapping from IRTemp to HReg. This tells the insn selector
+ which virtual register(s) are associated with each IRTemp
+ temporary. This is computed before insn selection starts, and
+ does not change. We expect this mapping to map precisely the
+ same set of IRTemps as the type mapping does.
+
+ - vregmap holds the primary register for the IRTemp.
+ - vregmapHI is only used for 64-bit integer-typed
+ IRTemps. It holds the identity of a second
+ 32-bit virtual HReg, which holds the high half
+ of the value.
+
+ - The code array, that is, the insns selected so far.
+
+ - A counter, for generating new virtual registers.
+
+ - The host subarchitecture we are selecting insns for.
+ This is set at the start and does not change.
+
+ Note, this is all host-independent. */
+
+typedef
+ struct {
+ IRTypeEnv* type_env;
+
+ HReg* vregmap;
+ HReg* vregmapHI;
+ Int n_vregmap;
+
+ HInstrArray* code;
+
+ Int vreg_ctr;
+
+ UInt hwcaps;
+ }
+ ISelEnv;
+
+
+static HReg lookupIRTemp ( ISelEnv* env, IRTemp tmp )
+{
+ vassert(tmp >= 0);
+ vassert(tmp < env->n_vregmap);
+ return env->vregmap[tmp];
+}
+
+static void lookupIRTemp64 ( HReg* vrHI, HReg* vrLO, ISelEnv* env, IRTemp tmp )
+{
+ vassert(tmp >= 0);
+ vassert(tmp < env->n_vregmap);
+ vassert(env->vregmapHI[tmp] != INVALID_HREG);
+ *vrLO = env->vregmap[tmp];
+ *vrHI = env->vregmapHI[tmp];
+}
+
+static void addInstr ( ISelEnv* env, X86Instr* instr )
+{
+ addHInstr(env->code, instr);
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ ppX86Instr(instr, False);
+ vex_printf("\n");
+ }
+}
+
+static HReg newVRegI ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcInt32, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+static HReg newVRegF ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcFlt64, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+static HReg newVRegV ( ISelEnv* env )
+{
+ HReg reg = mkHReg(env->vreg_ctr, HRcVec128, True/*virtual reg*/);
+ env->vreg_ctr++;
+ return reg;
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Forward declarations ---*/
+/*---------------------------------------------------------*/
+
+/* These are organised as iselXXX and iselXXX_wrk pairs.  The
+   iselXXX_wrk functions do the real work, but are not to be called
+   directly.  For each XXX, iselXXX calls its iselXXX_wrk counterpart,
+   then checks that all returned registers are virtual.
+*/
+static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e );
+static X86RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e );
+
+static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e );
+static X86RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e );
+
+static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e );
+static X86RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e );
+
+static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e );
+
+static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e );
+static X86AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e );
+
+static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+static void iselInt64Expr ( HReg* rHi, HReg* rLo,
+ ISelEnv* env, IRExpr* e );
+
+static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e );
+static X86CondCode iselCondCode ( ISelEnv* env, IRExpr* e );
+
+static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselDblExpr ( ISelEnv* env, IRExpr* e );
+
+static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselFltExpr ( ISelEnv* env, IRExpr* e );
+
+static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e );
+static HReg iselVecExpr ( ISelEnv* env, IRExpr* e );
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Misc helpers ---*/
+/*---------------------------------------------------------*/
+
+/* Make a int reg-reg move. */
+
+static X86Instr* mk_iMOVsd_RR ( HReg src, HReg dst )
+{
+ vassert(hregClass(src) == HRcInt32);
+ vassert(hregClass(dst) == HRcInt32);
+ return X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst);
+}
+
+
+/* Make a vector reg-reg move. */
+
+static X86Instr* mk_vMOVsd_RR ( HReg src, HReg dst )
+{
+ vassert(hregClass(src) == HRcVec128);
+ vassert(hregClass(dst) == HRcVec128);
+ return X86Instr_SseReRg(Xsse_MOV, src, dst);
+}
+
+/* Advance/retreat %esp by n. */
+
+static void add_to_esp ( ISelEnv* env, Int n )
+{
+ vassert(n > 0 && n < 256 && (n%4) == 0);
+ addInstr(env,
+ X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(n), hregX86_ESP()));
+}
+
+static void sub_from_esp ( ISelEnv* env, Int n )
+{
+ vassert(n > 0 && n < 256 && (n%4) == 0);
+ addInstr(env,
+ X86Instr_Alu32R(Xalu_SUB, X86RMI_Imm(n), hregX86_ESP()));
+}
+
+
+/* Given an amode, return one which references 4 bytes further
+ along. */
+
+static X86AMode* advance4 ( X86AMode* am )
+{
+ X86AMode* am4 = dopyX86AMode(am);
+ switch (am4->tag) {
+ case Xam_IRRS:
+ am4->Xam.IRRS.imm += 4; break;
+ case Xam_IR:
+ am4->Xam.IR.imm += 4; break;
+ default:
+ vpanic("advance4(x86,host)");
+ }
+ return am4;
+}
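+
+/* Example (illustrative): when storing a 64-bit value as two 32-bit
+   halves, 'am' might address the low word at 0(%esp) and
+   'advance4(am)' the high word at 4(%esp). */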
+
+
+/* Push an arg onto the host stack, in preparation for a call to a
+ helper function of some kind. Returns the number of 32-bit words
+ pushed. */
+
+static Int pushArg ( ISelEnv* env, IRExpr* arg )
+{
+ IRType arg_ty = typeOfIRExpr(env->type_env, arg);
+ if (arg_ty == Ity_I32) {
+ addInstr(env, X86Instr_Push(iselIntExpr_RMI(env, arg)));
+ return 1;
+ } else
+ if (arg_ty == Ity_I64) {
+ HReg rHi, rLo;
+ iselInt64Expr(&rHi, &rLo, env, arg);
+ addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
+ addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
+ return 2;
+ }
+ ppIRExpr(arg);
+ vpanic("pushArg(x86): can't handle arg of this type");
+}
+
+
+/* Complete the call to a helper function, by calling the
+ helper and clearing the args off the stack. */
+
+static
+void callHelperAndClearArgs ( ISelEnv* env, X86CondCode cc,
+ IRCallee* cee, Int n_arg_ws )
+{
+ /* Complication. Need to decide which reg to use as the fn address
+ pointer, in a way that doesn't trash regparm-passed
+ parameters. */
+ vassert(sizeof(void*) == 4);
+
+ addInstr(env, X86Instr_Call( cc, toUInt(Ptr_to_ULong(cee->addr)),
+ cee->regparms));
+ if (n_arg_ws > 0)
+ add_to_esp(env, 4*n_arg_ws);
+}
+
+
+/* Used only in doHelperCall. See big comment in doHelperCall re
+ handling of regparm args. This function figures out whether
+ evaluation of an expression might require use of a fixed register.
+ If in doubt return True (safe but suboptimal).
+*/
+static
+Bool mightRequireFixedRegs ( IRExpr* e )
+{
+ switch (e->tag) {
+ case Iex_RdTmp: case Iex_Const: case Iex_Get:
+ return False;
+ default:
+ return True;
+ }
+}
+
+
+/* Do a complete function call. guard is a Ity_Bit expression
+ indicating whether or not the call happens. If guard==NULL, the
+ call is unconditional. */
+
+static
+void doHelperCall ( ISelEnv* env,
+ Bool passBBP,
+ IRExpr* guard, IRCallee* cee, IRExpr** args )
+{
+ X86CondCode cc;
+ HReg argregs[3];
+ HReg tmpregs[3];
+ Bool danger;
+ Int not_done_yet, n_args, n_arg_ws, stack_limit,
+ i, argreg, argregX;
+
+ /* Marshal args for a call, do the call, and clear the stack.
+ Complexities to consider:
+
+ * if passBBP is True, %ebp (the baseblock pointer) is to be
+ passed as the first arg.
+
+ * If the callee claims regparmness of 1, 2 or 3, we must pass the
+ first 1, 2 or 3 args in registers (EAX, EDX, and ECX
+ respectively). To keep things relatively simple, only args of
+ type I32 may be passed as regparms -- just bomb out if anything
+ else turns up. Clearly this depends on the front ends not
+ trying to pass any other types as regparms.
+ */
+
+ /* 16 Nov 2004: the regparm handling is complicated by the
+ following problem.
+
+      Consider a call to a function with two regparm parameters:
+ f(e1,e2). We need to compute e1 into %eax and e2 into %edx.
+ Suppose code is first generated to compute e1 into %eax. Then,
+ code is generated to compute e2 into %edx. Unfortunately, if
+ the latter code sequence uses %eax, it will trash the value of
+ e1 computed by the former sequence. This could happen if (for
+ example) e2 itself involved a function call. In the code below,
+ args are evaluated right-to-left, not left-to-right, but the
+ principle and the problem are the same.
+
+ One solution is to compute all regparm-bound args into vregs
+ first, and once they are all done, move them to the relevant
+ real regs. This always gives correct code, but it also gives
+ a bunch of vreg-to-rreg moves which are usually redundant but
+ are hard for the register allocator to get rid of.
+
+ A compromise is to first examine all regparm'd argument
+ expressions. If they are all so simple that it is clear
+ they will be evaluated without use of any fixed registers,
+ use the old compute-directly-to-fixed-target scheme. If not,
+ be safe and use the via-vregs scheme.
+
+ Note this requires being able to examine an expression and
+ determine whether or not evaluation of it might use a fixed
+ register. That requires knowledge of how the rest of this
+ insn selector works. Currently just the following 3 are
+ regarded as safe -- hopefully they cover the majority of
+      arguments in practice: IRExpr_RdTmp, IRExpr_Const, IRExpr_Get.
+ */
+ vassert(cee->regparms >= 0 && cee->regparms <= 3);
+
+ n_args = n_arg_ws = 0;
+ while (args[n_args]) n_args++;
+
+ not_done_yet = n_args;
+ if (passBBP)
+ not_done_yet++;
+
+ stack_limit = cee->regparms;
+ if (cee->regparms > 0 && passBBP) stack_limit--;
+
+ /* ------ BEGIN marshall all arguments ------ */
+
+ /* Push (R to L) the stack-passed args, [n_args-1 .. stack_limit] */
+ for (i = n_args-1; i >= stack_limit; i--) {
+ n_arg_ws += pushArg(env, args[i]);
+ not_done_yet--;
+ }
+
+ /* args [stack_limit-1 .. 0] and possibly %ebp are to be passed in
+ registers. */
+
+ if (cee->regparms > 0) {
+
+ /* ------ BEGIN deal with regparms ------ */
+
+ /* deal with regparms, not forgetting %ebp if needed. */
+ argregs[0] = hregX86_EAX();
+ argregs[1] = hregX86_EDX();
+ argregs[2] = hregX86_ECX();
+ tmpregs[0] = tmpregs[1] = tmpregs[2] = INVALID_HREG;
+
+ argreg = cee->regparms;
+
+ /* In keeping with big comment above, detect potential danger
+ and use the via-vregs scheme if needed. */
+ danger = False;
+ for (i = stack_limit-1; i >= 0; i--) {
+ if (mightRequireFixedRegs(args[i])) {
+ danger = True;
+ break;
+ }
+ }
+
+ if (danger) {
+
+ /* Move via temporaries */
+ argregX = argreg;
+ for (i = stack_limit-1; i >= 0; i--) {
+
+ if (0) {
+ vex_printf("x86 host: register param is complex: ");
+ ppIRExpr(args[i]);
+ vex_printf("\n");
+ }
+
+ argreg--;
+ vassert(argreg >= 0);
+ vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
+ tmpregs[argreg] = iselIntExpr_R(env, args[i]);
+ not_done_yet--;
+ }
+ for (i = stack_limit-1; i >= 0; i--) {
+ argregX--;
+ vassert(argregX >= 0);
+ addInstr( env, mk_iMOVsd_RR( tmpregs[argregX], argregs[argregX] ) );
+ }
+
+ } else {
+ /* It's safe to compute all regparm args directly into their
+ target registers. */
+ for (i = stack_limit-1; i >= 0; i--) {
+ argreg--;
+ vassert(argreg >= 0);
+ vassert(typeOfIRExpr(env->type_env, args[i]) == Ity_I32);
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV,
+ iselIntExpr_RMI(env, args[i]),
+ argregs[argreg]));
+ not_done_yet--;
+ }
+
+ }
+
+ /* Not forgetting %ebp if needed. */
+ if (passBBP) {
+ vassert(argreg == 1);
+ addInstr(env, mk_iMOVsd_RR( hregX86_EBP(), argregs[0]));
+ not_done_yet--;
+ }
+
+ /* ------ END deal with regparms ------ */
+
+ } else {
+
+ /* No regparms. Heave %ebp on the stack if needed. */
+ if (passBBP) {
+ addInstr(env, X86Instr_Push(X86RMI_Reg(hregX86_EBP())));
+ n_arg_ws++;
+ not_done_yet--;
+ }
+
+ }
+
+ vassert(not_done_yet == 0);
+
+ /* ------ END marshall all arguments ------ */
+
+ /* Now we can compute the condition. We can't do it earlier
+ because the argument computations could trash the condition
+ codes. Be a bit clever to handle the common case where the
+ guard is 1:Bit. */
+ cc = Xcc_ALWAYS;
+ if (guard) {
+ if (guard->tag == Iex_Const
+ && guard->Iex.Const.con->tag == Ico_U1
+ && guard->Iex.Const.con->Ico.U1 == True) {
+ /* unconditional -- do nothing */
+ } else {
+ cc = iselCondCode( env, guard );
+ }
+ }
+
+ /* call the helper, and get the args off the stack afterwards. */
+ callHelperAndClearArgs( env, cc, cee, n_arg_ws );
+}
+
+
+/* Given a guest-state array descriptor, an index expression and a
+ bias, generate an X86AMode holding the relevant guest state
+ offset. */
+
+static
+X86AMode* genGuestArrayOffset ( ISelEnv* env, IRRegArray* descr,
+ IRExpr* off, Int bias )
+{
+ HReg tmp, roff;
+ Int elemSz = sizeofIRType(descr->elemTy);
+ Int nElems = descr->nElems;
+ Int shift = 0;
+
+ /* throw out any cases not generated by an x86 front end. In
+ theory there might be a day where we need to handle them -- if
+ we ever run non-x86-guest on x86 host. */
+
+ if (nElems != 8)
+ vpanic("genGuestArrayOffset(x86 host)(1)");
+
+ switch (elemSz) {
+ case 1: shift = 0; break;
+ case 4: shift = 2; break;
+ case 8: shift = 3; break;
+ default: vpanic("genGuestArrayOffset(x86 host)(2)");
+ }
+
+ /* Compute off into a reg, %off. Then return:
+
+ movl %off, %tmp
+ addl $bias, %tmp (if bias != 0)
+      andl $7, %tmp
+ ... base(%ebp, %tmp, shift) ...
+ */
+ tmp = newVRegI(env);
+ roff = iselIntExpr_R(env, off);
+ addInstr(env, mk_iMOVsd_RR(roff, tmp));
+ if (bias != 0) {
+ addInstr(env,
+ X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(bias), tmp));
+ }
+ addInstr(env,
+ X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(7), tmp));
+ return
+ X86AMode_IRRS( descr->base, hregX86_EBP(), tmp, shift );
+}
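+
+/* Worked example (illustrative): for an 8-entry array of I64s at
+   guest-state offset 'base', an index expression 'ix' and bias 1,
+   the above emits roughly
+
+      movl %ix, %tmp
+      addl $1, %tmp
+      andl $7, %tmp
+
+   and returns the amode base(%ebp, %tmp, 8), i.e. the masked,
+   biased index scaled by the element size. */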
+
+
+/* Mess with the FPU's rounding mode: set to the default rounding mode
+ (DEFAULT_FPUCW). */
+static
+void set_FPU_rounding_default ( ISelEnv* env )
+{
+ /* pushl $DEFAULT_FPUCW
+ fldcw 0(%esp)
+ addl $4, %esp
+ */
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+ addInstr(env, X86Instr_Push(X86RMI_Imm(DEFAULT_FPUCW)));
+ addInstr(env, X86Instr_FpLdCW(zero_esp));
+ add_to_esp(env, 4);
+}
+
+
+/* Mess with the FPU's rounding mode: 'mode' is an I32-typed
+ expression denoting a value in the range 0 .. 3, indicating a round
+ mode encoded as per type IRRoundingMode. Set the x87 FPU to have
+ the same rounding.
+*/
+static
+void set_FPU_rounding_mode ( ISelEnv* env, IRExpr* mode )
+{
+ HReg rrm = iselIntExpr_R(env, mode);
+ HReg rrm2 = newVRegI(env);
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+
+ /* movl %rrm, %rrm2
+ andl $3, %rrm2 -- shouldn't be needed; paranoia
+ shll $10, %rrm2
+ orl $DEFAULT_FPUCW, %rrm2
+ pushl %rrm2
+ fldcw 0(%esp)
+ addl $4, %esp
+ */
+ addInstr(env, mk_iMOVsd_RR(rrm, rrm2));
+ addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(3), rrm2));
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, 10, rrm2));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Imm(DEFAULT_FPUCW), rrm2));
+ addInstr(env, X86Instr_Push(X86RMI_Reg(rrm2)));
+ addInstr(env, X86Instr_FpLdCW(zero_esp));
+ add_to_esp(env, 4);
+}
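+
+/* Note (added commentary): this works because the rounding-control
+   field of the x87 control word occupies bits 11:10, and the
+   IRRoundingMode encoding (0 = nearest, 1 = -inf, 2 = +inf, 3 =
+   toward zero) happens to coincide with the x87 RC encoding, so the
+   IR value can simply be shifted left by 10 and OR'd into the
+   default control word. */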
+
+
+/* Generate !src into a new vector register, and be sure that the code
+ is SSE1 compatible. Amazing that Intel doesn't offer a less crappy
+ way to do this.
+*/
+static HReg do_sse_Not128 ( ISelEnv* env, HReg src )
+{
+ HReg dst = newVRegV(env);
+ /* Set dst to zero. If dst contains a NaN then all hell might
+ break loose after the comparison. So, first zero it. */
+ addInstr(env, X86Instr_SseReRg(Xsse_XOR, dst, dst));
+ /* And now make it all 1s ... */
+ addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, dst, dst));
+ /* Finally, xor 'src' into it. */
+ addInstr(env, X86Instr_SseReRg(Xsse_XOR, src, dst));
+ /* Doesn't that just totally suck? */
+ return dst;
+}
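+
+/* As a sketch, the above is:
+
+      xorps   %dst, %dst    -- dst = 0, so no NaNs are present
+      cmpeqps %dst, %dst    -- 0 == 0 in every lane, so dst = all 1s
+      xorps   %src, %dst    -- dst = ~src
+*/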
+
+
+/* Round an x87 FPU value to 53-bit-mantissa precision, to be used
+ after most non-simple FPU operations (simple = +, -, *, / and
+ sqrt).
+
+ This could be done a lot more efficiently if needed, by loading
+ zero and adding it to the value to be rounded (fldz ; faddp?).
+*/
+static void roundToF64 ( ISelEnv* env, HReg reg )
+{
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+ sub_from_esp(env, 8);
+ addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, reg, zero_esp));
+ addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, reg, zero_esp));
+ add_to_esp(env, 8);
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expressions (32/16/8 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Select insns for an integer-typed expression, and add them to the
+ code list. Return a reg holding the result. This reg will be a
+ virtual register. THE RETURNED REG MUST NOT BE MODIFIED. If you
+ want to modify it, ask for a new vreg, copy it in there, and modify
+ the copy. The register allocator will do its best to map both
+ vregs to the same real register, so the copies will often disappear
+ later in the game.
+
+ This should handle expressions of 32, 16 and 8-bit type. All
+ results are returned in a 32-bit register. For 16- and 8-bit
+ expressions, the upper 16/24 bits are arbitrary, so you should mask
+ or sign extend partial values if necessary.
+*/
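+
+/* For example (illustrative only): selecting Add32(t7, 0x10:I32)
+   emits
+
+      movl %vr_t7, %vr_dst
+      addl $0x10, %vr_dst
+
+   and returns vr_dst, which the caller must treat as read-only. */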
+
+static HReg iselIntExpr_R ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselIntExpr_R_wrk(env, e);
+ /* sanity checks ... */
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcInt32);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static HReg iselIntExpr_R_wrk ( ISelEnv* env, IRExpr* e )
+{
+ MatchInfo mi;
+
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
+
+ switch (e->tag) {
+
+ /* --------- TEMP --------- */
+ case Iex_RdTmp: {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ /* --------- LOAD --------- */
+ case Iex_Load: {
+ HReg dst = newVRegI(env);
+ X86AMode* amode = iselIntExpr_AMode ( env, e->Iex.Load.addr );
+
+ /* We can't handle big-endian loads, nor load-linked. */
+ if (e->Iex.Load.end != Iend_LE)
+ goto irreducible;
+
+ if (ty == Ity_I32) {
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV,
+ X86RMI_Mem(amode), dst) );
+ return dst;
+ }
+ if (ty == Ity_I16) {
+ addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
+ return dst;
+ }
+ if (ty == Ity_I8) {
+ addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
+ return dst;
+ }
+ break;
+ }
+
+ /* --------- TERNARY OP --------- */
+ case Iex_Triop: {
+ /* C3210 flags following FPU partial remainder (fprem), both
+ IEEE compliant (PREM1) and non-IEEE compliant (PREM). */
+ if (e->Iex.Triop.op == Iop_PRemC3210F64
+ || e->Iex.Triop.op == Iop_PRem1C3210F64) {
+ HReg junk = newVRegF(env);
+ HReg dst = newVRegI(env);
+ HReg srcL = iselDblExpr(env, e->Iex.Triop.arg2);
+ HReg srcR = iselDblExpr(env, e->Iex.Triop.arg3);
+ /* XXXROUNDINGFIXME */
+ /* set roundingmode here */
+ addInstr(env, X86Instr_FpBinary(
+                              e->Iex.Triop.op==Iop_PRemC3210F64
+ ? Xfp_PREM : Xfp_PREM1,
+ srcL,srcR,junk
+ ));
+ /* The previous pseudo-insn will have left the FPU's C3210
+ flags set correctly. So bag them. */
+ addInstr(env, X86Instr_FpStSW_AX());
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
+ addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0x4700), dst));
+ return dst;
+ }
+
+ break;
+ }
+
+ /* --------- BINARY OP --------- */
+ case Iex_Binop: {
+ X86AluOp aluOp;
+ X86ShiftOp shOp;
+
+ /* Pattern: Sub32(0,x) */
+ if (e->Iex.Binop.op == Iop_Sub32 && isZeroU32(e->Iex.Binop.arg1)) {
+ HReg dst = newVRegI(env);
+ HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(reg,dst));
+ addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
+ return dst;
+ }
+
+ /* Is it an addition or logical style op? */
+ switch (e->Iex.Binop.op) {
+ case Iop_Add8: case Iop_Add16: case Iop_Add32:
+ aluOp = Xalu_ADD; break;
+ case Iop_Sub8: case Iop_Sub16: case Iop_Sub32:
+ aluOp = Xalu_SUB; break;
+ case Iop_And8: case Iop_And16: case Iop_And32:
+ aluOp = Xalu_AND; break;
+ case Iop_Or8: case Iop_Or16: case Iop_Or32:
+ aluOp = Xalu_OR; break;
+ case Iop_Xor8: case Iop_Xor16: case Iop_Xor32:
+ aluOp = Xalu_XOR; break;
+ case Iop_Mul16: case Iop_Mul32:
+ aluOp = Xalu_MUL; break;
+ default:
+ aluOp = Xalu_INVALID; break;
+ }
+ /* For commutative ops we assume any literal
+ values are on the second operand. */
+ if (aluOp != Xalu_INVALID) {
+ HReg dst = newVRegI(env);
+ HReg reg = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(reg,dst));
+ addInstr(env, X86Instr_Alu32R(aluOp, rmi, dst));
+ return dst;
+ }
+ /* Could do better here; forcing the first arg into a reg
+ isn't always clever.
+ -- t70 = Xor32(And32(Xor32(LDle:I32(Add32(t41,0xFFFFFFA0:I32)),
+ LDle:I32(Add32(t41,0xFFFFFFA4:I32))),LDle:I32(Add32(
+ t41,0xFFFFFFA8:I32))),LDle:I32(Add32(t41,0xFFFFFFA0:I32)))
+ movl 0xFFFFFFA0(%vr41),%vr107
+ movl 0xFFFFFFA4(%vr41),%vr108
+ movl %vr107,%vr106
+ xorl %vr108,%vr106
+ movl 0xFFFFFFA8(%vr41),%vr109
+ movl %vr106,%vr105
+ andl %vr109,%vr105
+ movl 0xFFFFFFA0(%vr41),%vr110
+ movl %vr105,%vr104
+ xorl %vr110,%vr104
+ movl %vr104,%vr70
+ */
+
+ /* Perhaps a shift op? */
+ switch (e->Iex.Binop.op) {
+ case Iop_Shl32: case Iop_Shl16: case Iop_Shl8:
+ shOp = Xsh_SHL; break;
+ case Iop_Shr32: case Iop_Shr16: case Iop_Shr8:
+ shOp = Xsh_SHR; break;
+ case Iop_Sar32: case Iop_Sar16: case Iop_Sar8:
+ shOp = Xsh_SAR; break;
+ default:
+ shOp = Xsh_INVALID; break;
+ }
+ if (shOp != Xsh_INVALID) {
+ HReg dst = newVRegI(env);
+
+ /* regL = the value to be shifted */
+ HReg regL = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ addInstr(env, mk_iMOVsd_RR(regL,dst));
+
+ /* Do any necessary widening for 16/8 bit operands */
+ switch (e->Iex.Binop.op) {
+ case Iop_Shr8:
+ addInstr(env, X86Instr_Alu32R(
+ Xalu_AND, X86RMI_Imm(0xFF), dst));
+ break;
+ case Iop_Shr16:
+ addInstr(env, X86Instr_Alu32R(
+ Xalu_AND, X86RMI_Imm(0xFFFF), dst));
+ break;
+ case Iop_Sar8:
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, 24, dst));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 24, dst));
+ break;
+ case Iop_Sar16:
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, dst));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 16, dst));
+ break;
+ default: break;
+ }
+
+ /* Now consider the shift amount. If it's a literal, we
+ can do a much better job than the general case. */
+ if (e->Iex.Binop.arg2->tag == Iex_Const) {
+ /* assert that the IR is well-typed */
+ Int nshift;
+ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
+ nshift = e->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ vassert(nshift >= 0);
+ if (nshift > 0)
+ /* Can't allow nshift==0 since that means %cl */
+ addInstr(env, X86Instr_Sh32( shOp, nshift, dst ));
+ } else {
+ /* General case; we have to force the amount into %cl. */
+ HReg regR = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(regR,hregX86_ECX()));
+ addInstr(env, X86Instr_Sh32(shOp, 0/* %cl */, dst));
+ }
+ return dst;
+ }
+
+ /* Handle misc other ops. */
+
+ if (e->Iex.Binop.op == Iop_Max32U) {
+ HReg src1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg dst = newVRegI(env);
+ HReg src2 = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(src1,dst));
+ addInstr(env, X86Instr_Alu32R(Xalu_CMP, X86RMI_Reg(src2), dst));
+ addInstr(env, X86Instr_CMov32(Xcc_B, X86RM_Reg(src2), dst));
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_8HLto16) {
+ HReg hi8 = newVRegI(env);
+ HReg lo8 = newVRegI(env);
+ HReg hi8s = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg lo8s = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(hi8s, hi8));
+ addInstr(env, mk_iMOVsd_RR(lo8s, lo8));
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, 8, hi8));
+ addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFF), lo8));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo8), hi8));
+ return hi8;
+ }
+
+ if (e->Iex.Binop.op == Iop_16HLto32) {
+ HReg hi16 = newVRegI(env);
+ HReg lo16 = newVRegI(env);
+ HReg hi16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg lo16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(hi16s, hi16));
+ addInstr(env, mk_iMOVsd_RR(lo16s, lo16));
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, 16, hi16));
+ addInstr(env, X86Instr_Alu32R(Xalu_AND, X86RMI_Imm(0xFFFF), lo16));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(lo16), hi16));
+ return hi16;
+ }
+
+ if (e->Iex.Binop.op == Iop_MullS16 || e->Iex.Binop.op == Iop_MullS8
+ || e->Iex.Binop.op == Iop_MullU16 || e->Iex.Binop.op == Iop_MullU8) {
+ HReg a16 = newVRegI(env);
+ HReg b16 = newVRegI(env);
+ HReg a16s = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg b16s = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ Int shift = (e->Iex.Binop.op == Iop_MullS8
+ || e->Iex.Binop.op == Iop_MullU8)
+ ? 24 : 16;
+ X86ShiftOp shr_op = (e->Iex.Binop.op == Iop_MullS8
+ || e->Iex.Binop.op == Iop_MullS16)
+ ? Xsh_SAR : Xsh_SHR;
+
+ addInstr(env, mk_iMOVsd_RR(a16s, a16));
+ addInstr(env, mk_iMOVsd_RR(b16s, b16));
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, a16));
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, shift, b16));
+ addInstr(env, X86Instr_Sh32(shr_op, shift, a16));
+ addInstr(env, X86Instr_Sh32(shr_op, shift, b16));
+ addInstr(env, X86Instr_Alu32R(Xalu_MUL, X86RMI_Reg(a16), b16));
+ return b16;
+ }
+
+ if (e->Iex.Binop.op == Iop_CmpF64) {
+ HReg fL = iselDblExpr(env, e->Iex.Binop.arg1);
+ HReg fR = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegI(env);
+ addInstr(env, X86Instr_FpCmp(fL,fR,dst));
+ /* shift this right 8 bits so as to conform to CmpF64
+ definition. */
+ addInstr(env, X86Instr_Sh32(Xsh_SHR, 8, dst));
+ return dst;
+ }
+
+ if (e->Iex.Binop.op == Iop_F64toI32S
+ || e->Iex.Binop.op == Iop_F64toI16S) {
+ Int sz = e->Iex.Binop.op == Iop_F64toI16S ? 2 : 4;
+ HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegI(env);
+
+ /* Used several times ... */
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+
+         /* rf now holds the value to be converted; arg1 gives the
+            rounding mode, encoded as per the IRRoundingMode enum.
+            The first thing to do is set the FPU's rounding mode
+            accordingly. */
+
+ /* Create a space for the format conversion. */
+ /* subl $4, %esp */
+ sub_from_esp(env, 4);
+
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ /* gistw/l %rf, 0(%esp) */
+ addInstr(env, X86Instr_FpLdStI(False/*store*/,
+ toUChar(sz), rf, zero_esp));
+
+ if (sz == 2) {
+ /* movzwl 0(%esp), %dst */
+ addInstr(env, X86Instr_LoadEX(2,False,zero_esp,dst));
+ } else {
+ /* movl 0(%esp), %dst */
+ vassert(sz == 4);
+ addInstr(env, X86Instr_Alu32R(
+ Xalu_MOV, X86RMI_Mem(zero_esp), dst));
+ }
+
+ /* Restore default FPU rounding. */
+ set_FPU_rounding_default( env );
+
+ /* addl $4, %esp */
+ add_to_esp(env, 4);
+ return dst;
+ }
+
+ break;
+ }
+
+ /* --------- UNARY OP --------- */
+ case Iex_Unop: {
+
+ /* 1Uto8(32to1(expr32)) */
+ if (e->Iex.Unop.op == Iop_1Uto8) {
+ DECLARE_PATTERN(p_32to1_then_1Uto8);
+ DEFINE_PATTERN(p_32to1_then_1Uto8,
+ unop(Iop_1Uto8,unop(Iop_32to1,bind(0))));
+ if (matchIRExpr(&mi,p_32to1_then_1Uto8,e)) {
+ IRExpr* expr32 = mi.bindee[0];
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, expr32);
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, X86Instr_Alu32R(Xalu_AND,
+ X86RMI_Imm(1), dst));
+ return dst;
+ }
+ }
+
+ /* 8Uto32(LDle(expr32)) */
+ if (e->Iex.Unop.op == Iop_8Uto32) {
+ DECLARE_PATTERN(p_LDle8_then_8Uto32);
+ DEFINE_PATTERN(p_LDle8_then_8Uto32,
+ unop(Iop_8Uto32,
+ IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
+ if (matchIRExpr(&mi,p_LDle8_then_8Uto32,e)) {
+ HReg dst = newVRegI(env);
+ X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+ addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
+ return dst;
+ }
+ }
+
+ /* 8Sto32(LDle(expr32)) */
+ if (e->Iex.Unop.op == Iop_8Sto32) {
+ DECLARE_PATTERN(p_LDle8_then_8Sto32);
+ DEFINE_PATTERN(p_LDle8_then_8Sto32,
+ unop(Iop_8Sto32,
+ IRExpr_Load(Iend_LE,Ity_I8,bind(0))) );
+ if (matchIRExpr(&mi,p_LDle8_then_8Sto32,e)) {
+ HReg dst = newVRegI(env);
+ X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+ addInstr(env, X86Instr_LoadEX(1,True,amode,dst));
+ return dst;
+ }
+ }
+
+ /* 16Uto32(LDle(expr32)) */
+ if (e->Iex.Unop.op == Iop_16Uto32) {
+ DECLARE_PATTERN(p_LDle16_then_16Uto32);
+ DEFINE_PATTERN(p_LDle16_then_16Uto32,
+ unop(Iop_16Uto32,
+ IRExpr_Load(Iend_LE,Ity_I16,bind(0))) );
+ if (matchIRExpr(&mi,p_LDle16_then_16Uto32,e)) {
+ HReg dst = newVRegI(env);
+ X86AMode* amode = iselIntExpr_AMode ( env, mi.bindee[0] );
+ addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
+ return dst;
+ }
+ }
+
+ /* 8Uto32(GET:I8) */
+ if (e->Iex.Unop.op == Iop_8Uto32) {
+ if (e->Iex.Unop.arg->tag == Iex_Get) {
+ HReg dst;
+ X86AMode* amode;
+ vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I8);
+ dst = newVRegI(env);
+ amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
+ hregX86_EBP());
+ addInstr(env, X86Instr_LoadEX(1,False,amode,dst));
+ return dst;
+ }
+ }
+
+      /* 16Uto32(GET:I16) */
+ if (e->Iex.Unop.op == Iop_16Uto32) {
+ if (e->Iex.Unop.arg->tag == Iex_Get) {
+ HReg dst;
+ X86AMode* amode;
+ vassert(e->Iex.Unop.arg->Iex.Get.ty == Ity_I16);
+ dst = newVRegI(env);
+ amode = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
+ hregX86_EBP());
+ addInstr(env, X86Instr_LoadEX(2,False,amode,dst));
+ return dst;
+ }
+ }
+
+ switch (e->Iex.Unop.op) {
+ case Iop_8Uto16:
+ case Iop_8Uto32:
+ case Iop_16Uto32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ UInt mask = e->Iex.Unop.op==Iop_16Uto32 ? 0xFFFF : 0xFF;
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, X86Instr_Alu32R(Xalu_AND,
+ X86RMI_Imm(mask), dst));
+ return dst;
+ }
+ case Iop_8Sto16:
+ case Iop_8Sto32:
+ case Iop_16Sto32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ UInt amt = e->Iex.Unop.op==Iop_16Sto32 ? 16 : 24;
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, amt, dst));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, amt, dst));
+ return dst;
+ }
+ case Iop_Not8:
+ case Iop_Not16:
+ case Iop_Not32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, X86Instr_Unary32(Xun_NOT,dst));
+ return dst;
+ }
+ case Iop_64HIto32: {
+ HReg rHi, rLo;
+ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rHi; /* and abandon rLo .. poor wee thing :-) */
+ }
+ case Iop_64to32: {
+ HReg rHi, rLo;
+ iselInt64Expr(&rHi,&rLo, env, e->Iex.Unop.arg);
+ return rLo; /* similar stupid comment to the above ... */
+ }
+ case Iop_16HIto8:
+ case Iop_32HIto16: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ Int shift = e->Iex.Unop.op == Iop_16HIto8 ? 8 : 16;
+ addInstr(env, mk_iMOVsd_RR(src,dst) );
+ addInstr(env, X86Instr_Sh32(Xsh_SHR, shift, dst));
+ return dst;
+ }
+ case Iop_1Uto32:
+ case Iop_1Uto8: {
+ HReg dst = newVRegI(env);
+ X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Set32(cond,dst));
+ return dst;
+ }
+ case Iop_1Sto8:
+ case Iop_1Sto16:
+ case Iop_1Sto32: {
+ /* could do better than this, but for now ... */
+ HReg dst = newVRegI(env);
+ X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Set32(cond,dst));
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, dst));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
+ return dst;
+ }
+ case Iop_Ctz32: {
+ /* Count trailing zeroes, implemented by x86 'bsfl' */
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Bsfr32(True,src,dst));
+ return dst;
+ }
+ case Iop_Clz32: {
+ /* Count leading zeroes. Do 'bsrl' to establish the index
+ of the highest set bit, and subtract that value from
+ 31. */
+ HReg tmp = newVRegI(env);
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Bsfr32(False,src,tmp));
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV,
+ X86RMI_Imm(31), dst));
+ addInstr(env, X86Instr_Alu32R(Xalu_SUB,
+ X86RMI_Reg(tmp), dst));
+ return dst;
+ }
+
+ case Iop_CmpwNEZ32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src,dst));
+ addInstr(env, X86Instr_Unary32(Xun_NEG,dst));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(src), dst));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, dst));
+ return dst;
+ }
+ case Iop_Left8:
+ case Iop_Left16:
+ case Iop_Left32: {
+ HReg dst = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src, dst));
+ addInstr(env, X86Instr_Unary32(Xun_NEG, dst));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(src), dst));
+ return dst;
+ }
+
+ case Iop_V128to32: {
+ HReg dst = newVRegI(env);
+ HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+ X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+ sub_from_esp(env, 16);
+ addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
+ addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(esp0), dst ));
+ add_to_esp(env, 16);
+ return dst;
+ }
+
+ /* ReinterpF32asI32(e) */
+ /* Given an IEEE754 single, produce an I32 with the same bit
+ pattern. Keep stack 8-aligned even though only using 4
+ bytes. */
+ case Iop_ReinterpF32asI32: {
+ HReg rf = iselFltExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegI(env);
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+ /* paranoia */
+ set_FPU_rounding_default(env);
+ /* subl $8, %esp */
+ sub_from_esp(env, 8);
+ /* gstF %rf, 0(%esp) */
+ addInstr(env,
+ X86Instr_FpLdSt(False/*store*/, 4, rf, zero_esp));
+ /* movl 0(%esp), %dst */
+ addInstr(env,
+ X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), dst));
+ /* addl $8, %esp */
+ add_to_esp(env, 8);
+ return dst;
+ }
+
+ case Iop_16to8:
+ case Iop_32to8:
+ case Iop_32to16:
+ /* These are no-ops. */
+ return iselIntExpr_R(env, e->Iex.Unop.arg);
+
+ default:
+ break;
+ }
+ break;
+ }
+
+ /* --------- GET --------- */
+ case Iex_Get: {
+ if (ty == Ity_I32) {
+ HReg dst = newVRegI(env);
+ addInstr(env, X86Instr_Alu32R(
+ Xalu_MOV,
+ X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
+ hregX86_EBP())),
+ dst));
+ return dst;
+ }
+ if (ty == Ity_I8 || ty == Ity_I16) {
+ HReg dst = newVRegI(env);
+ addInstr(env, X86Instr_LoadEX(
+ toUChar(ty==Ity_I8 ? 1 : 2),
+ False,
+ X86AMode_IR(e->Iex.Get.offset,hregX86_EBP()),
+ dst));
+ return dst;
+ }
+ break;
+ }
+
+ case Iex_GetI: {
+ X86AMode* am
+ = genGuestArrayOffset(
+ env, e->Iex.GetI.descr,
+ e->Iex.GetI.ix, e->Iex.GetI.bias );
+ HReg dst = newVRegI(env);
+ if (ty == Ity_I8) {
+ addInstr(env, X86Instr_LoadEX( 1, False, am, dst ));
+ return dst;
+ }
+ if (ty == Ity_I32) {
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), dst));
+ return dst;
+ }
+ break;
+ }
+
+ /* --------- CCALL --------- */
+ case Iex_CCall: {
+ HReg dst = newVRegI(env);
+ vassert(ty == e->Iex.CCall.retty);
+
+ /* be very restrictive for now. Only 32/64-bit ints allowed
+ for args, and 32 bits for return type. */
+ if (e->Iex.CCall.retty != Ity_I32)
+ goto irreducible;
+
+ /* Marshal args, do the call, clear stack. */
+ doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
+
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), dst));
+ return dst;
+ }
+
+ /* --------- LITERAL --------- */
+ /* 32/16/8-bit literals */
+ case Iex_Const: {
+ X86RMI* rmi = iselIntExpr_RMI ( env, e );
+ HReg r = newVRegI(env);
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, rmi, r));
+ return r;
+ }
+
+ /* --------- MULTIPLEX --------- */
+ case Iex_Mux0X: {
+ if ((ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8)
+ && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
+ X86RM* r8;
+ HReg rX = iselIntExpr_R(env, e->Iex.Mux0X.exprX);
+ X86RM* r0 = iselIntExpr_RM(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(rX,dst));
+ r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ addInstr(env, X86Instr_Test32(0xFF, r8));
+ addInstr(env, X86Instr_CMov32(Xcc_Z,r0,dst));
+ return dst;
+ }
+ break;
+ }
+
+ default:
+ break;
+ } /* switch (e->tag) */
+
+ /* We get here if no pattern matched. */
+ irreducible:
+ ppIRExpr(e);
+ vpanic("iselIntExpr_R: cannot reduce tree");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expression auxiliaries ---*/
+/*---------------------------------------------------------*/
+
+/* --------------------- AMODEs --------------------- */
+
+/* Return an AMode which computes the value of the specified
+ expression, possibly also adding insns to the code list as a
+ result. The expression may only be a 32-bit one.
+*/
+
+static Bool sane_AMode ( X86AMode* am )
+{
+ switch (am->tag) {
+ case Xam_IR:
+ return
+ toBool( hregClass(am->Xam.IR.reg) == HRcInt32
+ && (hregIsVirtual(am->Xam.IR.reg)
+ || am->Xam.IR.reg == hregX86_EBP()) );
+ case Xam_IRRS:
+ return
+ toBool( hregClass(am->Xam.IRRS.base) == HRcInt32
+ && hregIsVirtual(am->Xam.IRRS.base)
+ && hregClass(am->Xam.IRRS.index) == HRcInt32
+ && hregIsVirtual(am->Xam.IRRS.index) );
+ default:
+ vpanic("sane_AMode: unknown x86 amode tag");
+ }
+}
+
+static X86AMode* iselIntExpr_AMode ( ISelEnv* env, IRExpr* e )
+{
+ X86AMode* am = iselIntExpr_AMode_wrk(env, e);
+ vassert(sane_AMode(am));
+ return am;
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static X86AMode* iselIntExpr_AMode_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32);
+
+ /* Add32( Add32(expr1, Shl32(expr2, simm)), imm32 ) */
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_Add32
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32
+ && e->Iex.Binop.arg1->tag == Iex_Binop
+ && e->Iex.Binop.arg1->Iex.Binop.op == Iop_Add32
+ && e->Iex.Binop.arg1->Iex.Binop.arg2->tag == Iex_Binop
+ && e->Iex.Binop.arg1->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
+ && e->Iex.Binop.arg1
+ ->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg1
+ ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
+ UInt shift = e->Iex.Binop.arg1
+ ->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ UInt imm32 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
+ if (shift == 1 || shift == 2 || shift == 3) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1->Iex.Binop.arg1);
+ HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg1
+ ->Iex.Binop.arg2->Iex.Binop.arg1 );
+ return X86AMode_IRRS(imm32, r1, r2, shift);
+ }
+ }
+
+ /* Add32(expr1, Shl32(expr2, imm)) */
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_Add32
+ && e->Iex.Binop.arg2->tag == Iex_Binop
+ && e->Iex.Binop.arg2->Iex.Binop.op == Iop_Shl32
+ && e->Iex.Binop.arg2->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8) {
+ UInt shift = e->Iex.Binop.arg2->Iex.Binop.arg2->Iex.Const.con->Ico.U8;
+ if (shift == 1 || shift == 2 || shift == 3) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ HReg r2 = iselIntExpr_R(env, e->Iex.Binop.arg2->Iex.Binop.arg1 );
+ return X86AMode_IRRS(0, r1, r2, shift);
+ }
+ }
+
+ /* Add32(expr,i) */
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_Add32
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U32) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ return X86AMode_IR(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32, r1);
+ }
+
+ /* Doesn't match anything in particular. Generate it into
+ a register and use that. */
+ {
+ HReg r1 = iselIntExpr_R(env, e);
+ return X86AMode_IR(0, r1);
+ }
+}
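+
+/* For example (illustrative): Add32(Add32(t1, Shl32(t2, 2:I8)),
+   0x40:I32) matches the first pattern above and collapses to the
+   single amode 0x40(%t1,%t2,4), with no instructions emitted beyond
+   those needed to evaluate t1 and t2. */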
+
+
+/* --------------------- RMIs --------------------- */
+
+/* Similarly, calculate an expression into an X86RMI operand. As with
+ iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
+
+static X86RMI* iselIntExpr_RMI ( ISelEnv* env, IRExpr* e )
+{
+ X86RMI* rmi = iselIntExpr_RMI_wrk(env, e);
+ /* sanity checks ... */
+ switch (rmi->tag) {
+ case Xrmi_Imm:
+ return rmi;
+ case Xrmi_Reg:
+ vassert(hregClass(rmi->Xrmi.Reg.reg) == HRcInt32);
+ vassert(hregIsVirtual(rmi->Xrmi.Reg.reg));
+ return rmi;
+ case Xrmi_Mem:
+ vassert(sane_AMode(rmi->Xrmi.Mem.am));
+ return rmi;
+ default:
+ vpanic("iselIntExpr_RMI: unknown x86 RMI tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static X86RMI* iselIntExpr_RMI_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
+
+ /* special case: immediate */
+ if (e->tag == Iex_Const) {
+ UInt u;
+ switch (e->Iex.Const.con->tag) {
+ case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
+ case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
+ case Ico_U8: u = 0xFF & (e->Iex.Const.con->Ico.U8); break;
+ default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
+ }
+ return X86RMI_Imm(u);
+ }
+
+ /* special case: 32-bit GET */
+ if (e->tag == Iex_Get && ty == Ity_I32) {
+ return X86RMI_Mem(X86AMode_IR(e->Iex.Get.offset,
+ hregX86_EBP()));
+ }
+
+ /* special case: 32-bit load from memory */
+ if (e->tag == Iex_Load && ty == Ity_I32
+ && e->Iex.Load.end == Iend_LE) {
+ X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
+ return X86RMI_Mem(am);
+ }
+
+ /* default case: calculate into a register and return that */
+ {
+ HReg r = iselIntExpr_R ( env, e );
+ return X86RMI_Reg(r);
+ }
+}
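+
+/* In short (added summary): a U32/U16/U8 constant becomes an
+   immediate operand, a 32-bit GET or little-endian load becomes a
+   memory operand, and anything else is computed into a register via
+   iselIntExpr_R and returned as a register operand. */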
+
+
+/* --------------------- RIs --------------------- */
+
+/* Calculate an expression into an X86RI operand. As with
+ iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
+
+static X86RI* iselIntExpr_RI ( ISelEnv* env, IRExpr* e )
+{
+ X86RI* ri = iselIntExpr_RI_wrk(env, e);
+ /* sanity checks ... */
+ switch (ri->tag) {
+ case Xri_Imm:
+ return ri;
+ case Xri_Reg:
+ vassert(hregClass(ri->Xri.Reg.reg) == HRcInt32);
+ vassert(hregIsVirtual(ri->Xri.Reg.reg));
+ return ri;
+ default:
+ vpanic("iselIntExpr_RI: unknown x86 RI tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static X86RI* iselIntExpr_RI_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
+
+ /* special case: immediate */
+ if (e->tag == Iex_Const) {
+ UInt u;
+ switch (e->Iex.Const.con->tag) {
+ case Ico_U32: u = e->Iex.Const.con->Ico.U32; break;
+ case Ico_U16: u = 0xFFFF & (e->Iex.Const.con->Ico.U16); break;
+ case Ico_U8: u = 0xFF & (e->Iex.Const.con->Ico.U8); break;
+ default: vpanic("iselIntExpr_RMI.Iex_Const(x86h)");
+ }
+ return X86RI_Imm(u);
+ }
+
+ /* default case: calculate into a register and return that */
+ {
+ HReg r = iselIntExpr_R ( env, e );
+ return X86RI_Reg(r);
+ }
+}
+
+
+/* --------------------- RMs --------------------- */
+
+/* Similarly, calculate an expression into an X86RM operand. As with
+ iselIntExpr_R, the expression can have type 32, 16 or 8 bits. */
+
+static X86RM* iselIntExpr_RM ( ISelEnv* env, IRExpr* e )
+{
+ X86RM* rm = iselIntExpr_RM_wrk(env, e);
+ /* sanity checks ... */
+ switch (rm->tag) {
+ case Xrm_Reg:
+ vassert(hregClass(rm->Xrm.Reg.reg) == HRcInt32);
+ vassert(hregIsVirtual(rm->Xrm.Reg.reg));
+ return rm;
+ case Xrm_Mem:
+ vassert(sane_AMode(rm->Xrm.Mem.am));
+ return rm;
+ default:
+ vpanic("iselIntExpr_RM: unknown x86 RM tag");
+ }
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static X86RM* iselIntExpr_RM_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8);
+
+ /* special case: 32-bit GET */
+ if (e->tag == Iex_Get && ty == Ity_I32) {
+ return X86RM_Mem(X86AMode_IR(e->Iex.Get.offset,
+ hregX86_EBP()));
+ }
+
+   /* special case: load from memory -- not handled specially here;
+      loads simply fall through to the default case and get computed
+      into a register. */
+
+ /* default case: calculate into a register and return that */
+ {
+ HReg r = iselIntExpr_R ( env, e );
+ return X86RM_Reg(r);
+ }
+}
+
+
+/* --------------------- CONDCODE --------------------- */
+
+/* Generate code to evaluate a bit-typed expression, returning the
+   condition code which corresponds to the expression notionally
+   evaluating to 1. */
+
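+/* For example (illustrative): for CmpLT32S(x,y) the worker below
+   emits "cmpl <y>, <x>" and returns Xcc_L; a subsequent conditional
+   jump, set or cmov on Xcc_L then implements the signed comparison. */
+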
+static X86CondCode iselCondCode ( ISelEnv* env, IRExpr* e )
+{
+ /* Uh, there's nothing we can sanity check here, unfortunately. */
+ return iselCondCode_wrk(env,e);
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static X86CondCode iselCondCode_wrk ( ISelEnv* env, IRExpr* e )
+{
+ MatchInfo mi;
+
+ vassert(e);
+ vassert(typeOfIRExpr(env->type_env,e) == Ity_I1);
+
+ /* var */
+ if (e->tag == Iex_RdTmp) {
+ HReg r32 = lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ /* Test32 doesn't modify r32; so this is OK. */
+ addInstr(env, X86Instr_Test32(1,X86RM_Reg(r32)));
+ return Xcc_NZ;
+ }
+
+ /* Constant 1:Bit */
+ if (e->tag == Iex_Const) {
+ HReg r;
+ vassert(e->Iex.Const.con->tag == Ico_U1);
+ vassert(e->Iex.Const.con->Ico.U1 == True
+ || e->Iex.Const.con->Ico.U1 == False);
+ r = newVRegI(env);
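+      /* Make r defined, then xor it with itself: this sets the Z
+         flag, so Xcc_Z is an always-true condition here and Xcc_NZ
+         an always-false one. */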
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV,X86RMI_Imm(0),r));
+ addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(r),r));
+ return e->Iex.Const.con->Ico.U1 ? Xcc_Z : Xcc_NZ;
+ }
+
+ /* Not1(e) */
+ if (e->tag == Iex_Unop && e->Iex.Unop.op == Iop_Not1) {
+ /* Generate code for the arg, and negate the test condition */
+ return 1 ^ iselCondCode(env, e->Iex.Unop.arg);
+ }
+
+ /* --- patterns rooted at: 32to1 --- */
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_32to1) {
+ X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Test32(1,rm));
+ return Xcc_NZ;
+ }
+
+ /* --- patterns rooted at: CmpNEZ8 --- */
+
+ /* CmpNEZ8(x) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ8) {
+ X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Test32(0xFF,rm));
+ return Xcc_NZ;
+ }
+
+ /* --- patterns rooted at: CmpNEZ16 --- */
+
+ /* CmpNEZ16(x) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ16) {
+ X86RM* rm = iselIntExpr_RM(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Test32(0xFFFF,rm));
+ return Xcc_NZ;
+ }
+
+ /* --- patterns rooted at: CmpNEZ32 --- */
+
+ /* CmpNEZ32(And32(x,y)) */
+ {
+ DECLARE_PATTERN(p_CmpNEZ32_And32);
+ DEFINE_PATTERN(p_CmpNEZ32_And32,
+ unop(Iop_CmpNEZ32, binop(Iop_And32, bind(0), bind(1))));
+ if (matchIRExpr(&mi, p_CmpNEZ32_And32, e)) {
+ HReg r0 = iselIntExpr_R(env, mi.bindee[0]);
+ X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
+ HReg tmp = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(r0, tmp));
+ addInstr(env, X86Instr_Alu32R(Xalu_AND,rmi1,tmp));
+ return Xcc_NZ;
+ }
+ }
+
+ /* CmpNEZ32(Or32(x,y)) */
+ {
+ DECLARE_PATTERN(p_CmpNEZ32_Or32);
+ DEFINE_PATTERN(p_CmpNEZ32_Or32,
+ unop(Iop_CmpNEZ32, binop(Iop_Or32, bind(0), bind(1))));
+ if (matchIRExpr(&mi, p_CmpNEZ32_Or32, e)) {
+ HReg r0 = iselIntExpr_R(env, mi.bindee[0]);
+ X86RMI* rmi1 = iselIntExpr_RMI(env, mi.bindee[1]);
+ HReg tmp = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(r0, tmp));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,rmi1,tmp));
+ return Xcc_NZ;
+ }
+ }
+
+ /* CmpNEZ32(GET(..):I32) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ32
+ && e->Iex.Unop.arg->tag == Iex_Get) {
+ X86AMode* am = X86AMode_IR(e->Iex.Unop.arg->Iex.Get.offset,
+ hregX86_EBP());
+ addInstr(env, X86Instr_Alu32M(Xalu_CMP, X86RI_Imm(0), am));
+ return Xcc_NZ;
+ }
+
+ /* CmpNEZ32(x) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ32) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Unop.arg);
+ X86RMI* rmi2 = X86RMI_Imm(0);
+ addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
+ return Xcc_NZ;
+ }
+
+ /* --- patterns rooted at: CmpNEZ64 --- */
+
+ /* CmpNEZ64(Or64(x,y)) */
+ {
+ DECLARE_PATTERN(p_CmpNEZ64_Or64);
+ DEFINE_PATTERN(p_CmpNEZ64_Or64,
+ unop(Iop_CmpNEZ64, binop(Iop_Or64, bind(0), bind(1))));
+ if (matchIRExpr(&mi, p_CmpNEZ64_Or64, e)) {
+ HReg hi1, lo1, hi2, lo2;
+ HReg tmp = newVRegI(env);
+ iselInt64Expr( &hi1, &lo1, env, mi.bindee[0] );
+ addInstr(env, mk_iMOVsd_RR(hi1, tmp));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo1),tmp));
+ iselInt64Expr( &hi2, &lo2, env, mi.bindee[1] );
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(hi2),tmp));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo2),tmp));
+ return Xcc_NZ;
+ }
+ }
+
+ /* CmpNEZ64(x) */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_CmpNEZ64) {
+ HReg hi, lo;
+ HReg tmp = newVRegI(env);
+ iselInt64Expr( &hi, &lo, env, e->Iex.Unop.arg );
+ addInstr(env, mk_iMOVsd_RR(hi, tmp));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(lo), tmp));
+ return Xcc_NZ;
+ }
+
+ /* --- patterns rooted at: Cmp{EQ,NE}{8,16} --- */
+
+ /* CmpEQ8 / CmpNE8 */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ8
+ || e->Iex.Binop.op == Iop_CmpNE8
+ || e->Iex.Binop.op == Iop_CasCmpEQ8
+ || e->Iex.Binop.op == Iop_CasCmpNE8)) {
+ if (isZeroU8(e->Iex.Binop.arg2)) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r1)));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
+ case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
+ default: vpanic("iselCondCode(x86): CmpXX8(expr,0:I8)");
+ }
+ } else {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ HReg r = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(r1,r));
+ addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
+ addInstr(env, X86Instr_Test32(0xFF,X86RM_Reg(r)));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ8: case Iop_CasCmpEQ8: return Xcc_Z;
+ case Iop_CmpNE8: case Iop_CasCmpNE8: return Xcc_NZ;
+ default: vpanic("iselCondCode(x86): CmpXX8(expr,expr)");
+ }
+ }
+ }
+
+ /* CmpEQ16 / CmpNE16 */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ16
+ || e->Iex.Binop.op == Iop_CmpNE16
+ || e->Iex.Binop.op == Iop_CasCmpEQ16
+ || e->Iex.Binop.op == Iop_CasCmpNE16)) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ HReg r = newVRegI(env);
+ addInstr(env, mk_iMOVsd_RR(r1,r));
+ addInstr(env, X86Instr_Alu32R(Xalu_XOR,rmi2,r));
+ addInstr(env, X86Instr_Test32(0xFFFF,X86RM_Reg(r)));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ16: case Iop_CasCmpEQ16: return Xcc_Z;
+ case Iop_CmpNE16: case Iop_CasCmpNE16: return Xcc_NZ;
+ default: vpanic("iselCondCode(x86): CmpXX16");
+ }
+ }
+
+ /* Cmp*32*(x,y) */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpEQ32
+ || e->Iex.Binop.op == Iop_CmpNE32
+ || e->Iex.Binop.op == Iop_CmpLT32S
+ || e->Iex.Binop.op == Iop_CmpLT32U
+ || e->Iex.Binop.op == Iop_CmpLE32S
+ || e->Iex.Binop.op == Iop_CmpLE32U
+ || e->Iex.Binop.op == Iop_CasCmpEQ32
+ || e->Iex.Binop.op == Iop_CasCmpNE32)) {
+ HReg r1 = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ X86RMI* rmi2 = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ addInstr(env, X86Instr_Alu32R(Xalu_CMP,rmi2,r1));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ32: case Iop_CasCmpEQ32: return Xcc_Z;
+ case Iop_CmpNE32: case Iop_CasCmpNE32: return Xcc_NZ;
+ case Iop_CmpLT32S: return Xcc_L;
+ case Iop_CmpLT32U: return Xcc_B;
+ case Iop_CmpLE32S: return Xcc_LE;
+ case Iop_CmpLE32U: return Xcc_BE;
+ default: vpanic("iselCondCode(x86): CmpXX32");
+ }
+ }
+
+ /* CmpNE64 */
+ if (e->tag == Iex_Binop
+ && (e->Iex.Binop.op == Iop_CmpNE64
+ || e->Iex.Binop.op == Iop_CmpEQ64)) {
+ HReg hi1, hi2, lo1, lo2;
+ HReg tHi = newVRegI(env);
+ HReg tLo = newVRegI(env);
+ iselInt64Expr( &hi1, &lo1, env, e->Iex.Binop.arg1 );
+ iselInt64Expr( &hi2, &lo2, env, e->Iex.Binop.arg2 );
+ addInstr(env, mk_iMOVsd_RR(hi1, tHi));
+ addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(hi2), tHi));
+ addInstr(env, mk_iMOVsd_RR(lo1, tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_XOR,X86RMI_Reg(lo2), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,X86RMI_Reg(tHi), tLo));
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpNE64: return Xcc_NZ;
+ case Iop_CmpEQ64: return Xcc_Z;
+ default: vpanic("iselCondCode(x86): CmpXX64");
+ }
+ }
+
+ ppIRExpr(e);
+ vpanic("iselCondCode");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Integer expressions (64 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Compute a 64-bit value into a register pair, which is returned as
+   the first two parameters.  As with iselIntExpr_R, these will be
+   virtual regs, and they must not be changed by subsequent code
+   emitted by the caller. */
+
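+/* For example (illustrative): Add64 is lowered below to an addl/adcl
+   pair on the low and high halves of the operands, and Shl64 to a
+   shldl/shll pair plus a fixup for shift amounts in 32 .. 63; see
+   the Iex_Binop cases in the worker function. */
+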
+static void iselInt64Expr ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
+{
+ iselInt64Expr_wrk(rHi, rLo, env, e);
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(*rHi) == HRcInt32);
+ vassert(hregIsVirtual(*rHi));
+ vassert(hregClass(*rLo) == HRcInt32);
+ vassert(hregIsVirtual(*rLo));
+}
+
+/* DO NOT CALL THIS DIRECTLY ! */
+static void iselInt64Expr_wrk ( HReg* rHi, HReg* rLo, ISelEnv* env, IRExpr* e )
+{
+ MatchInfo mi;
+ HWord fn = 0; /* helper fn for most SIMD64 stuff */
+ vassert(e);
+ vassert(typeOfIRExpr(env->type_env,e) == Ity_I64);
+
+ /* 64-bit literal */
+ if (e->tag == Iex_Const) {
+ ULong w64 = e->Iex.Const.con->Ico.U64;
+ UInt wHi = toUInt(w64 >> 32);
+ UInt wLo = toUInt(w64);
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ vassert(e->Iex.Const.con->tag == Ico_U64);
+ if (wLo == wHi) {
+ /* Save a precious Int register in this special case. */
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
+ *rHi = tLo;
+ *rLo = tLo;
+ } else {
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wHi), tHi));
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(wLo), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ }
+ return;
+ }
+
+ /* read 64-bit IRTemp */
+ if (e->tag == Iex_RdTmp) {
+ lookupIRTemp64( rHi, rLo, env, e->Iex.RdTmp.tmp);
+ return;
+ }
+
+ /* 64-bit load */
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ HReg tLo, tHi;
+ X86AMode *am0, *am4;
+ vassert(e->Iex.Load.ty == Ity_I64);
+ tLo = newVRegI(env);
+ tHi = newVRegI(env);
+ am0 = iselIntExpr_AMode(env, e->Iex.Load.addr);
+ am4 = advance4(am0);
+ addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am0), tLo ));
+ addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64-bit GET */
+ if (e->tag == Iex_Get) {
+ X86AMode* am = X86AMode_IR(e->Iex.Get.offset, hregX86_EBP());
+ X86AMode* am4 = advance4(am);
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
+ addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64-bit GETI */
+ if (e->tag == Iex_GetI) {
+ X86AMode* am
+ = genGuestArrayOffset( env, e->Iex.GetI.descr,
+ e->Iex.GetI.ix, e->Iex.GetI.bias );
+ X86AMode* am4 = advance4(am);
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am), tLo ));
+ addInstr(env, X86Instr_Alu32R( Xalu_MOV, X86RMI_Mem(am4), tHi ));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64-bit Mux0X: Mux0X(g, expr, 0:I64) */
+ if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.exprX)) {
+ X86RM* r8;
+ HReg e0Lo, e0Hi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+ iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
+ r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
+ addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
+ addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
+ addInstr(env, X86Instr_Test32(0xFF, r8));
+ addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tHi));
+ addInstr(env, X86Instr_CMov32(Xcc_NZ,X86RM_Mem(zero_esp),tLo));
+ add_to_esp(env, 4);
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+ /* 64-bit Mux0X: Mux0X(g, 0:I64, expr) */
+ if (e->tag == Iex_Mux0X && isZeroU64(e->Iex.Mux0X.expr0)) {
+ X86RM* r8;
+ HReg e0Lo, e0Hi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+ iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.exprX);
+ r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ addInstr(env, mk_iMOVsd_RR( e0Hi, tHi ) );
+ addInstr(env, mk_iMOVsd_RR( e0Lo, tLo ) );
+ addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
+ addInstr(env, X86Instr_Test32(0xFF, r8));
+ addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tHi));
+ addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Mem(zero_esp),tLo));
+ add_to_esp(env, 4);
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64-bit Mux0X: Mux0X(g, expr, expr) */
+ if (e->tag == Iex_Mux0X) {
+ X86RM* r8;
+ HReg e0Lo, e0Hi, eXLo, eXHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ iselInt64Expr(&e0Hi, &e0Lo, env, e->Iex.Mux0X.expr0);
+ iselInt64Expr(&eXHi, &eXLo, env, e->Iex.Mux0X.exprX);
+ addInstr(env, mk_iMOVsd_RR(eXHi, tHi));
+ addInstr(env, mk_iMOVsd_RR(eXLo, tLo));
+ r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ addInstr(env, X86Instr_Test32(0xFF, r8));
+ /* This assumes the first cmov32 doesn't trash the condition
+ codes, so they are still available for the second cmov32 */
+ addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Hi),tHi));
+ addInstr(env, X86Instr_CMov32(Xcc_Z,X86RM_Reg(e0Lo),tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* --------- BINARY ops --------- */
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+ /* 32 x 32 -> 64 multiply */
+ case Iop_MullU32:
+ case Iop_MullS32: {
+         /* get one operand into %eax, and the other into an R/M.
+            Ideally we would make an educated guess about which
+            operand is better off in which position. */
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ Bool syned = toBool(e->Iex.Binop.op == Iop_MullS32);
+ X86RM* rmLeft = iselIntExpr_RM(env, e->Iex.Binop.arg1);
+ HReg rRight = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(rRight, hregX86_EAX()));
+ addInstr(env, X86Instr_MulL(syned, rmLeft));
+ /* Result is now in EDX:EAX. Tell the caller. */
+ addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 64 x 32 -> (32(rem),32(div)) division */
+ case Iop_DivModU64to32:
+ case Iop_DivModS64to32: {
+ /* Get the 64-bit operand into edx:eax, and the other into
+ any old R/M. */
+ HReg sHi, sLo;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ Bool syned = toBool(e->Iex.Binop.op == Iop_DivModS64to32);
+ X86RM* rmRight = iselIntExpr_RM(env, e->Iex.Binop.arg2);
+ iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
+ addInstr(env, mk_iMOVsd_RR(sHi, hregX86_EDX()));
+ addInstr(env, mk_iMOVsd_RR(sLo, hregX86_EAX()));
+ addInstr(env, X86Instr_Div(syned, rmRight));
+ addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* Or64/And64/Xor64 */
+ case Iop_Or64:
+ case Iop_And64:
+ case Iop_Xor64: {
+ HReg xLo, xHi, yLo, yHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ X86AluOp op = e->Iex.Binop.op==Iop_Or64 ? Xalu_OR
+ : e->Iex.Binop.op==Iop_And64 ? Xalu_AND
+ : Xalu_XOR;
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
+ addInstr(env, mk_iMOVsd_RR(xHi, tHi));
+ addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yHi), tHi));
+ addInstr(env, mk_iMOVsd_RR(xLo, tLo));
+ addInstr(env, X86Instr_Alu32R(op, X86RMI_Reg(yLo), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* Add64/Sub64 */
+ case Iop_Add64:
+ if (e->Iex.Binop.arg2->tag == Iex_Const) {
+ /* special case Add64(e, const) */
+ ULong w64 = e->Iex.Binop.arg2->Iex.Const.con->Ico.U64;
+ UInt wHi = toUInt(w64 >> 32);
+ UInt wLo = toUInt(w64);
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg xLo, xHi;
+ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U64);
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ addInstr(env, mk_iMOVsd_RR(xHi, tHi));
+ addInstr(env, mk_iMOVsd_RR(xLo, tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Imm(wLo), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Imm(wHi), tHi));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+ /* else fall through to the generic case */
+ case Iop_Sub64: {
+ HReg xLo, xHi, yLo, yHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ addInstr(env, mk_iMOVsd_RR(xHi, tHi));
+ addInstr(env, mk_iMOVsd_RR(xLo, tLo));
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
+ if (e->Iex.Binop.op==Iop_Add64) {
+ addInstr(env, X86Instr_Alu32R(Xalu_ADD, X86RMI_Reg(yLo), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_ADC, X86RMI_Reg(yHi), tHi));
+ } else {
+ addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
+ }
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 32HLto64(e1,e2) */
+ case Iop_32HLto64:
+ *rHi = iselIntExpr_R(env, e->Iex.Binop.arg1);
+ *rLo = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ return;
+
+ /* 64-bit shifts */
+ case Iop_Shl64: {
+ /* We use the same ingenious scheme as gcc. Put the value
+ to be shifted into %hi:%lo, and the shift amount into
+ %cl. Then (dsts on right, a la ATT syntax):
+
+ shldl %cl, %lo, %hi -- make %hi be right for the
+ -- shift amt %cl % 32
+ shll %cl, %lo -- make %lo be right for the
+ -- shift amt %cl % 32
+
+ Now, if (shift amount % 64) is in the range 32 .. 63,
+ we have to do a fixup, which puts the result low half
+ into the result high half, and zeroes the low half:
+
+ testl $32, %ecx
+
+ cmovnz %lo, %hi
+ movl $0, %tmp -- sigh; need yet another reg
+ cmovnz %tmp, %lo
+ */
+ HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
+ tLo = newVRegI(env);
+ tHi = newVRegI(env);
+ tTemp = newVRegI(env);
+ rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
+ addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
+ addInstr(env, mk_iMOVsd_RR(sHi, tHi));
+ addInstr(env, mk_iMOVsd_RR(sLo, tLo));
+ /* Ok. Now shift amt is in %ecx, and value is in tHi/tLo
+ and those regs are legitimately modifiable. */
+ addInstr(env, X86Instr_Sh3232(Xsh_SHL, 0/*%cl*/, tLo, tHi));
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, 0/*%cl*/, tLo));
+ addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
+ addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tLo), tHi));
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
+ addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ case Iop_Shr64: {
+ /* We use the same ingenious scheme as gcc. Put the value
+ to be shifted into %hi:%lo, and the shift amount into
+ %cl. Then:
+
+ shrdl %cl, %hi, %lo -- make %lo be right for the
+ -- shift amt %cl % 32
+ shrl %cl, %hi -- make %hi be right for the
+ -- shift amt %cl % 32
+
+ Now, if (shift amount % 64) is in the range 32 .. 63,
+ we have to do a fixup, which puts the result high half
+ into the result low half, and zeroes the high half:
+
+ testl $32, %ecx
+
+ cmovnz %hi, %lo
+ movl $0, %tmp -- sigh; need yet another reg
+ cmovnz %tmp, %hi
+ */
+ HReg rAmt, sHi, sLo, tHi, tLo, tTemp;
+ tLo = newVRegI(env);
+ tHi = newVRegI(env);
+ tTemp = newVRegI(env);
+ rAmt = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ iselInt64Expr(&sHi,&sLo, env, e->Iex.Binop.arg1);
+ addInstr(env, mk_iMOVsd_RR(rAmt, hregX86_ECX()));
+ addInstr(env, mk_iMOVsd_RR(sHi, tHi));
+ addInstr(env, mk_iMOVsd_RR(sLo, tLo));
+ /* Ok. Now shift amt is in %ecx, and value is in tHi/tLo
+ and those regs are legitimately modifiable. */
+ addInstr(env, X86Instr_Sh3232(Xsh_SHR, 0/*%cl*/, tHi, tLo));
+ addInstr(env, X86Instr_Sh32(Xsh_SHR, 0/*%cl*/, tHi));
+ addInstr(env, X86Instr_Test32(32, X86RM_Reg(hregX86_ECX())));
+ addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tHi), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tTemp));
+ addInstr(env, X86Instr_CMov32(Xcc_NZ, X86RM_Reg(tTemp), tHi));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* F64 -> I64 */
+ /* Sigh, this is an almost exact copy of the F64 -> I32/I16
+ case. Unfortunately I see no easy way to avoid the
+ duplication. */
+ case Iop_F64toI64S: {
+ HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+
+ /* Used several times ... */
+ /* Careful ... this sharing is only safe because
+ zero_esp/four_esp do not hold any registers which the
+ register allocator could attempt to swizzle later. */
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+ X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
+
+            /* rf now holds the value to be converted; arg1 gives
+               the rounding mode, encoded as per the IRRoundingMode
+               enum.  The first thing to do is set the FPU's
+               rounding mode accordingly. */
+
+ /* Create a space for the format conversion. */
+ /* subl $8, %esp */
+ sub_from_esp(env, 8);
+
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ /* gistll %rf, 0(%esp) */
+ addInstr(env, X86Instr_FpLdStI(False/*store*/, 8, rf, zero_esp));
+
+ /* movl 0(%esp), %dstLo */
+ /* movl 4(%esp), %dstHi */
+ addInstr(env, X86Instr_Alu32R(
+ Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
+ addInstr(env, X86Instr_Alu32R(
+ Xalu_MOV, X86RMI_Mem(four_esp), tHi));
+
+ /* Restore default FPU rounding. */
+ set_FPU_rounding_default( env );
+
+ /* addl $8, %esp */
+ add_to_esp(env, 8);
+
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ case Iop_Add8x8:
+ fn = (HWord)h_generic_calc_Add8x8; goto binnish;
+ case Iop_Add16x4:
+ fn = (HWord)h_generic_calc_Add16x4; goto binnish;
+ case Iop_Add32x2:
+ fn = (HWord)h_generic_calc_Add32x2; goto binnish;
+
+ case Iop_Avg8Ux8:
+ fn = (HWord)h_generic_calc_Avg8Ux8; goto binnish;
+ case Iop_Avg16Ux4:
+ fn = (HWord)h_generic_calc_Avg16Ux4; goto binnish;
+
+ case Iop_CmpEQ8x8:
+ fn = (HWord)h_generic_calc_CmpEQ8x8; goto binnish;
+ case Iop_CmpEQ16x4:
+ fn = (HWord)h_generic_calc_CmpEQ16x4; goto binnish;
+ case Iop_CmpEQ32x2:
+ fn = (HWord)h_generic_calc_CmpEQ32x2; goto binnish;
+
+ case Iop_CmpGT8Sx8:
+ fn = (HWord)h_generic_calc_CmpGT8Sx8; goto binnish;
+ case Iop_CmpGT16Sx4:
+ fn = (HWord)h_generic_calc_CmpGT16Sx4; goto binnish;
+ case Iop_CmpGT32Sx2:
+ fn = (HWord)h_generic_calc_CmpGT32Sx2; goto binnish;
+
+ case Iop_InterleaveHI8x8:
+ fn = (HWord)h_generic_calc_InterleaveHI8x8; goto binnish;
+ case Iop_InterleaveLO8x8:
+ fn = (HWord)h_generic_calc_InterleaveLO8x8; goto binnish;
+ case Iop_InterleaveHI16x4:
+ fn = (HWord)h_generic_calc_InterleaveHI16x4; goto binnish;
+ case Iop_InterleaveLO16x4:
+ fn = (HWord)h_generic_calc_InterleaveLO16x4; goto binnish;
+ case Iop_InterleaveHI32x2:
+ fn = (HWord)h_generic_calc_InterleaveHI32x2; goto binnish;
+ case Iop_InterleaveLO32x2:
+ fn = (HWord)h_generic_calc_InterleaveLO32x2; goto binnish;
+ case Iop_CatOddLanes16x4:
+ fn = (HWord)h_generic_calc_CatOddLanes16x4; goto binnish;
+ case Iop_CatEvenLanes16x4:
+ fn = (HWord)h_generic_calc_CatEvenLanes16x4; goto binnish;
+ case Iop_Perm8x8:
+ fn = (HWord)h_generic_calc_Perm8x8; goto binnish;
+
+ case Iop_Max8Ux8:
+ fn = (HWord)h_generic_calc_Max8Ux8; goto binnish;
+ case Iop_Max16Sx4:
+ fn = (HWord)h_generic_calc_Max16Sx4; goto binnish;
+ case Iop_Min8Ux8:
+ fn = (HWord)h_generic_calc_Min8Ux8; goto binnish;
+ case Iop_Min16Sx4:
+ fn = (HWord)h_generic_calc_Min16Sx4; goto binnish;
+
+ case Iop_Mul16x4:
+ fn = (HWord)h_generic_calc_Mul16x4; goto binnish;
+ case Iop_Mul32x2:
+ fn = (HWord)h_generic_calc_Mul32x2; goto binnish;
+ case Iop_MulHi16Sx4:
+ fn = (HWord)h_generic_calc_MulHi16Sx4; goto binnish;
+ case Iop_MulHi16Ux4:
+ fn = (HWord)h_generic_calc_MulHi16Ux4; goto binnish;
+
+ case Iop_QAdd8Sx8:
+ fn = (HWord)h_generic_calc_QAdd8Sx8; goto binnish;
+ case Iop_QAdd16Sx4:
+ fn = (HWord)h_generic_calc_QAdd16Sx4; goto binnish;
+ case Iop_QAdd8Ux8:
+ fn = (HWord)h_generic_calc_QAdd8Ux8; goto binnish;
+ case Iop_QAdd16Ux4:
+ fn = (HWord)h_generic_calc_QAdd16Ux4; goto binnish;
+
+ case Iop_QNarrow32Sx2:
+ fn = (HWord)h_generic_calc_QNarrow32Sx2; goto binnish;
+ case Iop_QNarrow16Sx4:
+ fn = (HWord)h_generic_calc_QNarrow16Sx4; goto binnish;
+ case Iop_QNarrow16Ux4:
+ fn = (HWord)h_generic_calc_QNarrow16Ux4; goto binnish;
+
+ case Iop_QSub8Sx8:
+ fn = (HWord)h_generic_calc_QSub8Sx8; goto binnish;
+ case Iop_QSub16Sx4:
+ fn = (HWord)h_generic_calc_QSub16Sx4; goto binnish;
+ case Iop_QSub8Ux8:
+ fn = (HWord)h_generic_calc_QSub8Ux8; goto binnish;
+ case Iop_QSub16Ux4:
+ fn = (HWord)h_generic_calc_QSub16Ux4; goto binnish;
+
+ case Iop_Sub8x8:
+ fn = (HWord)h_generic_calc_Sub8x8; goto binnish;
+ case Iop_Sub16x4:
+ fn = (HWord)h_generic_calc_Sub16x4; goto binnish;
+ case Iop_Sub32x2:
+ fn = (HWord)h_generic_calc_Sub32x2; goto binnish;
+
+ binnish: {
+ /* Note: the following assumes all helpers are of
+ signature
+ ULong fn ( ULong, ULong ), and they are
+ not marked as regparm functions.
+ */
+ HReg xLo, xHi, yLo, yHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Binop.arg2);
+ addInstr(env, X86Instr_Push(X86RMI_Reg(yHi)));
+ addInstr(env, X86Instr_Push(X86RMI_Reg(yLo)));
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
+ addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
+ addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
+ add_to_esp(env, 4*4);
+ addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
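+
+      /* Illustrative stack picture just before the call in the
+         'binnish' block above (added commentary):
+
+             0(%esp)  xLo
+             4(%esp)  xHi
+             8(%esp)  yLo
+            12(%esp)  yHi
+
+         i.e. a plain cdecl call of ULong fn(ULong x, ULong y), with
+         the result returned in %edx:%eax. */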
+
+ case Iop_ShlN32x2:
+ fn = (HWord)h_generic_calc_ShlN32x2; goto shifty;
+ case Iop_ShlN16x4:
+ fn = (HWord)h_generic_calc_ShlN16x4; goto shifty;
+ case Iop_ShlN8x8:
+ fn = (HWord)h_generic_calc_ShlN8x8; goto shifty;
+ case Iop_ShrN32x2:
+ fn = (HWord)h_generic_calc_ShrN32x2; goto shifty;
+ case Iop_ShrN16x4:
+ fn = (HWord)h_generic_calc_ShrN16x4; goto shifty;
+ case Iop_SarN32x2:
+ fn = (HWord)h_generic_calc_SarN32x2; goto shifty;
+ case Iop_SarN16x4:
+ fn = (HWord)h_generic_calc_SarN16x4; goto shifty;
+ case Iop_SarN8x8:
+ fn = (HWord)h_generic_calc_SarN8x8; goto shifty;
+ shifty: {
+ /* Note: the following assumes all helpers are of
+ signature
+ ULong fn ( ULong, UInt ), and they are
+ not marked as regparm functions.
+ */
+ HReg xLo, xHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ X86RMI* y = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ addInstr(env, X86Instr_Push(y));
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Binop.arg1);
+ addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
+ addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
+ addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
+ add_to_esp(env, 3*4);
+ addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ default:
+ break;
+ }
+ } /* if (e->tag == Iex_Binop) */
+
+
+ /* --------- UNARY ops --------- */
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+
+ /* 32Sto64(e) */
+ case Iop_32Sto64: {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src,tHi));
+ addInstr(env, mk_iMOVsd_RR(src,tLo));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tHi));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 32Uto64(e) */
+ case Iop_32Uto64: {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src,tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* 16Uto64(e) */
+ case Iop_16Uto64: {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg src = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(src,tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_AND,
+ X86RMI_Imm(0xFFFF), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* V128{HI}to64 */
+ case Iop_V128HIto64:
+ case Iop_V128to64: {
+ Int off = e->Iex.Unop.op==Iop_V128HIto64 ? 8 : 0;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg vec = iselVecExpr(env, e->Iex.Unop.arg);
+ X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+ X86AMode* espLO = X86AMode_IR(off, hregX86_ESP());
+ X86AMode* espHI = X86AMode_IR(off+4, hregX86_ESP());
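+            /* Route the value through memory: park the whole vector
+               at esp, then reload the selected 64-bit half ('off'
+               picks high vs low) as two 32-bit words. */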
+ sub_from_esp(env, 16);
+ addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, esp0));
+ addInstr(env, X86Instr_Alu32R( Xalu_MOV,
+ X86RMI_Mem(espLO), tLo ));
+ addInstr(env, X86Instr_Alu32R( Xalu_MOV,
+ X86RMI_Mem(espHI), tHi ));
+ add_to_esp(env, 16);
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* could do better than this, but for now ... */
+ case Iop_1Sto64: {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ X86CondCode cond = iselCondCode(env, e->Iex.Unop.arg);
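+            /* tLo = cond ? 1 : 0; shifting left by 31 and then
+               arithmetic-right by 31 smears bit 0 across all 32
+               bits, giving 0 or 0xFFFFFFFF, and tHi is a copy. */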
+ addInstr(env, X86Instr_Set32(cond,tLo));
+ addInstr(env, X86Instr_Sh32(Xsh_SHL, 31, tLo));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tLo));
+ addInstr(env, mk_iMOVsd_RR(tLo, tHi));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* Not64(e) */
+ case Iop_Not64: {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ HReg sHi, sLo;
+ iselInt64Expr(&sHi, &sLo, env, e->Iex.Unop.arg);
+ addInstr(env, mk_iMOVsd_RR(sHi, tHi));
+ addInstr(env, mk_iMOVsd_RR(sLo, tLo));
+ addInstr(env, X86Instr_Unary32(Xun_NOT,tHi));
+ addInstr(env, X86Instr_Unary32(Xun_NOT,tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* Left64(e) */
+ case Iop_Left64: {
+ HReg yLo, yHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ /* yHi:yLo = arg */
+ iselInt64Expr(&yHi, &yLo, env, e->Iex.Unop.arg);
+ /* tLo = 0 - yLo, and set carry */
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_SUB, X86RMI_Reg(yLo), tLo));
+ /* tHi = 0 - yHi - carry */
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Imm(0), tHi));
+ addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(yHi), tHi));
+            /* So now we have tHi:tLo = -arg.  To finish off, OR
+               'arg' back in, giving the final result
+               tHi:tLo = arg | -arg. */
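+            /* Net effect: Left64(x) = x | -x, which sets every bit
+               from the lowest set bit of x upwards; e.g. (a worked
+               example, not from the source) x = 0x18 gives
+               0xFFFFFFFFFFFFFFF8. */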
+ addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yLo), tLo));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR, X86RMI_Reg(yHi), tHi));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ /* --- patterns rooted at: CmpwNEZ64 --- */
+
+ /* CmpwNEZ64(e) */
+ case Iop_CmpwNEZ64: {
+
+ DECLARE_PATTERN(p_CmpwNEZ64_Or64);
+ DEFINE_PATTERN(p_CmpwNEZ64_Or64,
+ unop(Iop_CmpwNEZ64,binop(Iop_Or64,bind(0),bind(1))));
+ if (matchIRExpr(&mi, p_CmpwNEZ64_Or64, e)) {
+ /* CmpwNEZ64(Or64(x,y)) */
+ HReg xHi,xLo,yHi,yLo;
+ HReg xBoth = newVRegI(env);
+ HReg merged = newVRegI(env);
+ HReg tmp2 = newVRegI(env);
+
+ iselInt64Expr(&xHi,&xLo, env, mi.bindee[0]);
+ addInstr(env, mk_iMOVsd_RR(xHi,xBoth));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(xLo),xBoth));
+
+ iselInt64Expr(&yHi,&yLo, env, mi.bindee[1]);
+ addInstr(env, mk_iMOVsd_RR(yHi,merged));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(yLo),merged));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(xBoth),merged));
+
+ /* tmp2 = (merged | -merged) >>s 31 */
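+               /* This works because, for any nonzero w, at least one
+                  of w and -w has bit 31 set, so (w | -w) is negative
+                  and the SAR by 31 yields all-ones; w == 0 yields 0. */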
+ addInstr(env, mk_iMOVsd_RR(merged,tmp2));
+ addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(merged), tmp2));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
+ *rHi = tmp2;
+ *rLo = tmp2;
+ return;
+ } else {
+ /* CmpwNEZ64(e) */
+ HReg srcLo, srcHi;
+ HReg tmp1 = newVRegI(env);
+ HReg tmp2 = newVRegI(env);
+ /* srcHi:srcLo = arg */
+ iselInt64Expr(&srcHi, &srcLo, env, e->Iex.Unop.arg);
+ /* tmp1 = srcHi | srcLo */
+ addInstr(env, mk_iMOVsd_RR(srcHi,tmp1));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(srcLo), tmp1));
+ /* tmp2 = (tmp1 | -tmp1) >>s 31 */
+ addInstr(env, mk_iMOVsd_RR(tmp1,tmp2));
+ addInstr(env, X86Instr_Unary32(Xun_NEG,tmp2));
+ addInstr(env, X86Instr_Alu32R(Xalu_OR,
+ X86RMI_Reg(tmp1), tmp2));
+ addInstr(env, X86Instr_Sh32(Xsh_SAR, 31, tmp2));
+ *rHi = tmp2;
+ *rLo = tmp2;
+ return;
+ }
+ }
+
+ /* ReinterpF64asI64(e) */
+ /* Given an IEEE754 double, produce an I64 with the same bit
+ pattern. */
+ case Iop_ReinterpF64asI64: {
+ HReg rf = iselDblExpr(env, e->Iex.Unop.arg);
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ X86AMode* zero_esp = X86AMode_IR(0, hregX86_ESP());
+ X86AMode* four_esp = X86AMode_IR(4, hregX86_ESP());
+ /* paranoia */
+ set_FPU_rounding_default(env);
+ /* subl $8, %esp */
+ sub_from_esp(env, 8);
+ /* gstD %rf, 0(%esp) */
+ addInstr(env,
+ X86Instr_FpLdSt(False/*store*/, 8, rf, zero_esp));
+ /* movl 0(%esp), %tLo */
+ addInstr(env,
+ X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(zero_esp), tLo));
+ /* movl 4(%esp), %tHi */
+ addInstr(env,
+ X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(four_esp), tHi));
+ /* addl $8, %esp */
+ add_to_esp(env, 8);
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ case Iop_CmpNEZ32x2:
+ fn = (HWord)h_generic_calc_CmpNEZ32x2; goto unish;
+ case Iop_CmpNEZ16x4:
+ fn = (HWord)h_generic_calc_CmpNEZ16x4; goto unish;
+ case Iop_CmpNEZ8x8:
+ fn = (HWord)h_generic_calc_CmpNEZ8x8; goto unish;
+ unish: {
+            /* Note: the following assumes that all the helpers have
+               the signature
+                  ULong fn ( ULong )
+               and are not marked as regparm functions.
+            */
+ HReg xLo, xHi;
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+ iselInt64Expr(&xHi, &xLo, env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Push(X86RMI_Reg(xHi)));
+ addInstr(env, X86Instr_Push(X86RMI_Reg(xLo)));
+ addInstr(env, X86Instr_Call( Xcc_ALWAYS, (UInt)fn, 0 ));
+ add_to_esp(env, 2*4);
+ addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ default:
+ break;
+ }
+ } /* if (e->tag == Iex_Unop) */
+
+
+ /* --------- CCALL --------- */
+ if (e->tag == Iex_CCall) {
+ HReg tLo = newVRegI(env);
+ HReg tHi = newVRegI(env);
+
+ /* Marshal args, do the call, clear stack. */
+ doHelperCall( env, False, NULL, e->Iex.CCall.cee, e->Iex.CCall.args );
+
+ addInstr(env, mk_iMOVsd_RR(hregX86_EDX(), tHi));
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(), tLo));
+ *rHi = tHi;
+ *rLo = tLo;
+ return;
+ }
+
+ ppIRExpr(e);
+ vpanic("iselInt64Expr");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Floating point expressions (32 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Nothing interesting here; really just wrappers for
+ 64-bit stuff. */
+
+static HReg iselFltExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselFltExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcFlt64); /* yes, really Flt64 */
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselFltExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(ty == Ity_F32);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ X86AMode* am;
+ HReg res = newVRegF(env);
+ vassert(e->Iex.Load.ty == Ity_F32);
+ am = iselIntExpr_AMode(env, e->Iex.Load.addr);
+ addInstr(env, X86Instr_FpLdSt(True/*load*/, 4, res, am));
+ return res;
+ }
+
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.op == Iop_F64toF32) {
+ /* Although the result is still held in a standard FPU register,
+ we need to round it to reflect the loss of accuracy/range
+ entailed in casting it to a 32-bit float. */
+ HReg dst = newVRegF(env);
+ HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+ addInstr(env, X86Instr_Fp64to32(src,dst));
+ set_FPU_rounding_default( env );
+ return dst;
+ }
+
+ if (e->tag == Iex_Get) {
+ X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
+ hregX86_EBP() );
+ HReg res = newVRegF(env);
+ addInstr(env, X86Instr_FpLdSt( True/*load*/, 4, res, am ));
+ return res;
+ }
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.op == Iop_ReinterpI32asF32) {
+ /* Given an I32, produce an IEEE754 float with the same bit
+ pattern. */
+ HReg dst = newVRegF(env);
+ X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
+ /* paranoia */
+ addInstr(env, X86Instr_Push(rmi));
+ addInstr(env, X86Instr_FpLdSt(
+ True/*load*/, 4, dst,
+ X86AMode_IR(0, hregX86_ESP())));
+ add_to_esp(env, 4);
+ return dst;
+ }
+
+ if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF32toInt) {
+ HReg rf = iselFltExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegF(env);
+
+ /* rf now holds the value to be rounded. The first thing to do
+ is set the FPU's rounding mode accordingly. */
+
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ /* grndint %rf, %dst */
+ addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
+
+ /* Restore default FPU rounding. */
+ set_FPU_rounding_default( env );
+
+ return dst;
+ }
+
+ ppIRExpr(e);
+ vpanic("iselFltExpr_wrk");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Floating point expressions (64 bit) ---*/
+/*---------------------------------------------------------*/
+
+/* Compute a 64-bit floating point value into a register, the identity
+ of which is returned. As with iselIntExpr_R, the reg may be either
+ real or virtual; in any case it must not be changed by subsequent
+ code emitted by the caller. */
+
+/* IEEE 754 formats. From http://www.freesoft.org/CIE/RFC/1832/32.htm:
+
+ Type S (1 bit) E (11 bits) F (52 bits)
+ ---- --------- ----------- -----------
+ signalling NaN u 2047 (max) .0uuuuu---u
+ (with at least
+ one 1 bit)
+ quiet NaN u 2047 (max) .1uuuuu---u
+
+ negative infinity 1 2047 (max) .000000---0
+
+ positive infinity 0 2047 (max) .000000---0
+
+ negative zero 1 0 .000000---0
+
+ positive zero 0 0 .000000---0
+*/
+
+static HReg iselDblExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselDblExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcFlt64);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselDblExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(e);
+ vassert(ty == Ity_F64);
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Const) {
+ union { UInt u32x2[2]; ULong u64; Double f64; } u;
+ HReg freg = newVRegF(env);
+ vassert(sizeof(u) == 8);
+ vassert(sizeof(u.u64) == 8);
+ vassert(sizeof(u.f64) == 8);
+ vassert(sizeof(u.u32x2) == 8);
+
+ if (e->Iex.Const.con->tag == Ico_F64) {
+ u.f64 = e->Iex.Const.con->Ico.F64;
+ }
+ else if (e->Iex.Const.con->tag == Ico_F64i) {
+ u.u64 = e->Iex.Const.con->Ico.F64i;
+ }
+ else
+ vpanic("iselDblExpr(x86): const");
+
+ addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[1])));
+ addInstr(env, X86Instr_Push(X86RMI_Imm(u.u32x2[0])));
+ addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, freg,
+ X86AMode_IR(0, hregX86_ESP())));
+ add_to_esp(env, 8);
+ return freg;
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ X86AMode* am;
+ HReg res = newVRegF(env);
+ vassert(e->Iex.Load.ty == Ity_F64);
+ am = iselIntExpr_AMode(env, e->Iex.Load.addr);
+ addInstr(env, X86Instr_FpLdSt(True/*load*/, 8, res, am));
+ return res;
+ }
+
+ if (e->tag == Iex_Get) {
+ X86AMode* am = X86AMode_IR( e->Iex.Get.offset,
+ hregX86_EBP() );
+ HReg res = newVRegF(env);
+ addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
+ return res;
+ }
+
+ if (e->tag == Iex_GetI) {
+ X86AMode* am
+ = genGuestArrayOffset(
+ env, e->Iex.GetI.descr,
+ e->Iex.GetI.ix, e->Iex.GetI.bias );
+ HReg res = newVRegF(env);
+ addInstr(env, X86Instr_FpLdSt( True/*load*/, 8, res, am ));
+ return res;
+ }
+
+ if (e->tag == Iex_Triop) {
+ X86FpOp fpop = Xfp_INVALID;
+ switch (e->Iex.Triop.op) {
+ case Iop_AddF64: fpop = Xfp_ADD; break;
+ case Iop_SubF64: fpop = Xfp_SUB; break;
+ case Iop_MulF64: fpop = Xfp_MUL; break;
+ case Iop_DivF64: fpop = Xfp_DIV; break;
+ case Iop_ScaleF64: fpop = Xfp_SCALE; break;
+ case Iop_Yl2xF64: fpop = Xfp_YL2X; break;
+ case Iop_Yl2xp1F64: fpop = Xfp_YL2XP1; break;
+ case Iop_AtanF64: fpop = Xfp_ATAN; break;
+ case Iop_PRemF64: fpop = Xfp_PREM; break;
+ case Iop_PRem1F64: fpop = Xfp_PREM1; break;
+ default: break;
+ }
+ if (fpop != Xfp_INVALID) {
+ HReg res = newVRegF(env);
+ HReg srcL = iselDblExpr(env, e->Iex.Triop.arg2);
+ HReg srcR = iselDblExpr(env, e->Iex.Triop.arg3);
+ /* XXXROUNDINGFIXME */
+ /* set roundingmode here */
+ addInstr(env, X86Instr_FpBinary(fpop,srcL,srcR,res));
+ if (fpop != Xfp_ADD && fpop != Xfp_SUB
+ && fpop != Xfp_MUL && fpop != Xfp_DIV)
+ roundToF64(env, res);
+ return res;
+ }
+ }
+
+ if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_RoundF64toInt) {
+ HReg rf = iselDblExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegF(env);
+
+ /* rf now holds the value to be rounded. The first thing to do
+ is set the FPU's rounding mode accordingly. */
+
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ /* grndint %rf, %dst */
+ addInstr(env, X86Instr_FpUnary(Xfp_ROUND, rf, dst));
+
+ /* Restore default FPU rounding. */
+ set_FPU_rounding_default( env );
+
+ return dst;
+ }
+
+ if (e->tag == Iex_Binop && e->Iex.Binop.op == Iop_I64StoF64) {
+ HReg dst = newVRegF(env);
+ HReg rHi,rLo;
+ iselInt64Expr( &rHi, &rLo, env, e->Iex.Binop.arg2);
+ addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
+ addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
+
+ /* Set host rounding mode */
+ set_FPU_rounding_mode( env, e->Iex.Binop.arg1 );
+
+ addInstr(env, X86Instr_FpLdStI(
+ True/*load*/, 8, dst,
+ X86AMode_IR(0, hregX86_ESP())));
+
+ /* Restore default FPU rounding. */
+ set_FPU_rounding_default( env );
+
+ add_to_esp(env, 8);
+ return dst;
+ }
+
+ if (e->tag == Iex_Binop) {
+ X86FpOp fpop = Xfp_INVALID;
+ switch (e->Iex.Binop.op) {
+ case Iop_SinF64: fpop = Xfp_SIN; break;
+ case Iop_CosF64: fpop = Xfp_COS; break;
+ case Iop_TanF64: fpop = Xfp_TAN; break;
+ case Iop_2xm1F64: fpop = Xfp_2XM1; break;
+ case Iop_SqrtF64: fpop = Xfp_SQRT; break;
+ default: break;
+ }
+ if (fpop != Xfp_INVALID) {
+ HReg res = newVRegF(env);
+ HReg src = iselDblExpr(env, e->Iex.Binop.arg2);
+ /* XXXROUNDINGFIXME */
+ /* set roundingmode here */
+ addInstr(env, X86Instr_FpUnary(fpop,src,res));
+ if (fpop != Xfp_SQRT
+ && fpop != Xfp_NEG && fpop != Xfp_ABS)
+ roundToF64(env, res);
+ return res;
+ }
+ }
+
+ if (e->tag == Iex_Unop) {
+ X86FpOp fpop = Xfp_INVALID;
+ switch (e->Iex.Unop.op) {
+ case Iop_NegF64: fpop = Xfp_NEG; break;
+ case Iop_AbsF64: fpop = Xfp_ABS; break;
+ default: break;
+ }
+ if (fpop != Xfp_INVALID) {
+ HReg res = newVRegF(env);
+ HReg src = iselDblExpr(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_FpUnary(fpop,src,res));
+ if (fpop != Xfp_NEG && fpop != Xfp_ABS)
+ roundToF64(env, res);
+ return res;
+ }
+ }
+
+ if (e->tag == Iex_Unop) {
+ switch (e->Iex.Unop.op) {
+ case Iop_I32StoF64: {
+ HReg dst = newVRegF(env);
+ HReg ri = iselIntExpr_R(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Push(X86RMI_Reg(ri)));
+ set_FPU_rounding_default(env);
+ addInstr(env, X86Instr_FpLdStI(
+ True/*load*/, 4, dst,
+ X86AMode_IR(0, hregX86_ESP())));
+ add_to_esp(env, 4);
+ return dst;
+ }
+ case Iop_ReinterpI64asF64: {
+ /* Given an I64, produce an IEEE754 double with the same
+ bit pattern. */
+ HReg dst = newVRegF(env);
+ HReg rHi, rLo;
+ iselInt64Expr( &rHi, &rLo, env, e->Iex.Unop.arg);
+ /* paranoia */
+ set_FPU_rounding_default(env);
+ addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
+ addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
+ addInstr(env, X86Instr_FpLdSt(
+ True/*load*/, 8, dst,
+ X86AMode_IR(0, hregX86_ESP())));
+ add_to_esp(env, 8);
+ return dst;
+ }
+ case Iop_F32toF64: {
+ /* this is a no-op */
+ HReg res = iselFltExpr(env, e->Iex.Unop.arg);
+ return res;
+ }
+ default:
+ break;
+ }
+ }
+
+ /* --------- MULTIPLEX --------- */
+ if (e->tag == Iex_Mux0X) {
+ if (ty == Ity_F64
+ && typeOfIRExpr(env->type_env,e->Iex.Mux0X.cond) == Ity_I8) {
+ X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ HReg rX = iselDblExpr(env, e->Iex.Mux0X.exprX);
+ HReg r0 = iselDblExpr(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegF(env);
+ addInstr(env, X86Instr_FpUnary(Xfp_MOV,rX,dst));
+ addInstr(env, X86Instr_Test32(0xFF, r8));
+ addInstr(env, X86Instr_FpCMov(Xcc_Z,r0,dst));
+ return dst;
+ }
+ }
+
+ ppIRExpr(e);
+ vpanic("iselDblExpr_wrk");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: SIMD (Vector) expressions, 128 bit. ---*/
+/*---------------------------------------------------------*/
+
+static HReg iselVecExpr ( ISelEnv* env, IRExpr* e )
+{
+ HReg r = iselVecExpr_wrk( env, e );
+# if 0
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+# endif
+ vassert(hregClass(r) == HRcVec128);
+ vassert(hregIsVirtual(r));
+ return r;
+}
+
+
+/* DO NOT CALL THIS DIRECTLY */
+static HReg iselVecExpr_wrk ( ISelEnv* env, IRExpr* e )
+{
+
+# define REQUIRE_SSE1 \
+ do { if (env->hwcaps == 0/*baseline, no sse*/) \
+ goto vec_fail; \
+ } while (0)
+
+# define REQUIRE_SSE2 \
+ do { if (0 == (env->hwcaps & VEX_HWCAPS_X86_SSE2)) \
+ goto vec_fail; \
+ } while (0)
+
+# define SSE2_OR_ABOVE \
+ (env->hwcaps & VEX_HWCAPS_X86_SSE2)
+
+ MatchInfo mi;
+ Bool arg1isEReg = False;
+ X86SseOp op = Xsse_INVALID;
+ IRType ty = typeOfIRExpr(env->type_env,e);
+ vassert(e);
+ vassert(ty == Ity_V128);
+
+ REQUIRE_SSE1;
+
+ if (e->tag == Iex_RdTmp) {
+ return lookupIRTemp(env, e->Iex.RdTmp.tmp);
+ }
+
+ if (e->tag == Iex_Get) {
+ HReg dst = newVRegV(env);
+ addInstr(env, X86Instr_SseLdSt(
+ True/*load*/,
+ dst,
+ X86AMode_IR(e->Iex.Get.offset, hregX86_EBP())
+ )
+ );
+ return dst;
+ }
+
+ if (e->tag == Iex_Load && e->Iex.Load.end == Iend_LE) {
+ HReg dst = newVRegV(env);
+ X86AMode* am = iselIntExpr_AMode(env, e->Iex.Load.addr);
+ addInstr(env, X86Instr_SseLdSt( True/*load*/, dst, am ));
+ return dst;
+ }
+
+ if (e->tag == Iex_Const) {
+ HReg dst = newVRegV(env);
+ vassert(e->Iex.Const.con->tag == Ico_V128);
+ addInstr(env, X86Instr_SseConst(e->Iex.Const.con->Ico.V128, dst));
+ return dst;
+ }
+
+ if (e->tag == Iex_Unop) {
+
+ if (SSE2_OR_ABOVE) {
+ /* 64UtoV128(LDle:I64(addr)) */
+ DECLARE_PATTERN(p_zwiden_load64);
+ DEFINE_PATTERN(p_zwiden_load64,
+ unop(Iop_64UtoV128,
+ IRExpr_Load(Iend_LE,Ity_I64,bind(0))));
+ if (matchIRExpr(&mi, p_zwiden_load64, e)) {
+ X86AMode* am = iselIntExpr_AMode(env, mi.bindee[0]);
+ HReg dst = newVRegV(env);
+ addInstr(env, X86Instr_SseLdzLO(8, dst, am));
+ return dst;
+ }
+ }
+
+ switch (e->Iex.Unop.op) {
+
+ case Iop_NotV128: {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ return do_sse_Not128(env, arg);
+ }
+
+ case Iop_CmpNEZ64x2: {
+ /* We can use SSE2 instructions for this. */
+ /* Ideally, we want to do a 64Ix2 comparison against zero of
+ the operand. Problem is no such insn exists. Solution
+ therefore is to do a 32Ix4 comparison instead, and bitwise-
+ negate (NOT) the result. Let a,b,c,d be 32-bit lanes, and
+ let the not'd result of this initial comparison be a:b:c:d.
+ What we need to compute is (a|b):(a|b):(c|d):(c|d). So, use
+ pshufd to create a value b:a:d:c, and OR that with a:b:c:d,
+ giving the required result.
+
+ The required selection sequence is 2,3,0,1, which
+ according to Intel's documentation means the pshufd
+ literal value is 0xB1, that is,
+ (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)
+ */
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg tmp = newVRegV(env);
+ HReg dst = newVRegV(env);
+ REQUIRE_SSE2;
+ addInstr(env, X86Instr_SseReRg(Xsse_XOR, tmp, tmp));
+ addInstr(env, X86Instr_SseReRg(Xsse_CMPEQ32, arg, tmp));
+ tmp = do_sse_Not128(env, tmp);
+ addInstr(env, X86Instr_SseShuf(0xB1, tmp, dst));
+ addInstr(env, X86Instr_SseReRg(Xsse_OR, tmp, dst));
+ return dst;
+ }
+
+ case Iop_CmpNEZ32x4: {
+ /* Sigh, we have to generate lousy code since this has to
+ work on SSE1 hosts */
+ /* basically, the idea is: for each lane:
+ movl lane, %r ; negl %r (now CF = lane==0 ? 0 : 1)
+ sbbl %r, %r (now %r = 1Sto32(CF))
+ movl %r, lane
+ */
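+         /* The sbb step works because negl sets CF iff the lane was
+            nonzero, and "sbbl %r,%r" computes r - r - CF = -CF,
+            i.e. 0x00000000 or 0xFFFFFFFF. */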
+ Int i;
+ X86AMode* am;
+ X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ HReg r32 = newVRegI(env);
+ sub_from_esp(env, 16);
+ addInstr(env, X86Instr_SseLdSt(False/*store*/, arg, esp0));
+ for (i = 0; i < 4; i++) {
+ am = X86AMode_IR(i*4, hregX86_ESP());
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Mem(am), r32));
+ addInstr(env, X86Instr_Unary32(Xun_NEG, r32));
+ addInstr(env, X86Instr_Alu32R(Xalu_SBB, X86RMI_Reg(r32), r32));
+ addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r32), am));
+ }
+ addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
+ add_to_esp(env, 16);
+ return dst;
+ }
+
+ case Iop_CmpNEZ8x16:
+ case Iop_CmpNEZ16x8: {
+ /* We can use SSE2 instructions for this. */
+ HReg arg;
+ HReg vec0 = newVRegV(env);
+ HReg vec1 = newVRegV(env);
+ HReg dst = newVRegV(env);
+ X86SseOp cmpOp
+ = e->Iex.Unop.op==Iop_CmpNEZ16x8 ? Xsse_CMPEQ16
+ : Xsse_CMPEQ8;
+ REQUIRE_SSE2;
+ addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec0, vec0));
+ addInstr(env, mk_vMOVsd_RR(vec0, vec1));
+ addInstr(env, X86Instr_Sse32Fx4(Xsse_CMPEQF, vec1, vec1));
+ /* defer arg computation to here so as to give CMPEQF as long
+ as possible to complete */
+ arg = iselVecExpr(env, e->Iex.Unop.arg);
+ /* vec0 is all 0s; vec1 is all 1s */
+ addInstr(env, mk_vMOVsd_RR(arg, dst));
+ /* 16x8 or 8x16 comparison == */
+ addInstr(env, X86Instr_SseReRg(cmpOp, vec0, dst));
+ /* invert result */
+ addInstr(env, X86Instr_SseReRg(Xsse_XOR, vec1, dst));
+ return dst;
+ }
+
+ case Iop_Recip32Fx4: op = Xsse_RCPF; goto do_32Fx4_unary;
+ case Iop_RSqrt32Fx4: op = Xsse_RSQRTF; goto do_32Fx4_unary;
+ case Iop_Sqrt32Fx4: op = Xsse_SQRTF; goto do_32Fx4_unary;
+ do_32Fx4_unary:
+ {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ addInstr(env, X86Instr_Sse32Fx4(op, arg, dst));
+ return dst;
+ }
+
+ case Iop_Recip64Fx2: op = Xsse_RCPF; goto do_64Fx2_unary;
+ case Iop_RSqrt64Fx2: op = Xsse_RSQRTF; goto do_64Fx2_unary;
+ case Iop_Sqrt64Fx2: op = Xsse_SQRTF; goto do_64Fx2_unary;
+ do_64Fx2_unary:
+ {
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ REQUIRE_SSE2;
+ addInstr(env, X86Instr_Sse64Fx2(op, arg, dst));
+ return dst;
+ }
+
+ case Iop_Recip32F0x4: op = Xsse_RCPF; goto do_32F0x4_unary;
+ case Iop_RSqrt32F0x4: op = Xsse_RSQRTF; goto do_32F0x4_unary;
+ case Iop_Sqrt32F0x4: op = Xsse_SQRTF; goto do_32F0x4_unary;
+ do_32F0x4_unary:
+ {
+ /* A bit subtle. We have to copy the arg to the result
+ register first, because actually doing the SSE scalar insn
+ leaves the upper 3/4 of the destination register
+ unchanged. Whereas the required semantics of these
+ primops is that the upper 3/4 is simply copied in from the
+ argument. */
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(arg, dst));
+ addInstr(env, X86Instr_Sse32FLo(op, arg, dst));
+ return dst;
+ }
+
+ case Iop_Recip64F0x2: op = Xsse_RCPF; goto do_64F0x2_unary;
+ case Iop_RSqrt64F0x2: op = Xsse_RSQRTF; goto do_64F0x2_unary;
+ case Iop_Sqrt64F0x2: op = Xsse_SQRTF; goto do_64F0x2_unary;
+ do_64F0x2_unary:
+ {
+ /* A bit subtle. We have to copy the arg to the result
+ register first, because actually doing the SSE scalar insn
+ leaves the upper half of the destination register
+ unchanged. Whereas the required semantics of these
+ primops is that the upper half is simply copied in from the
+ argument. */
+ HReg arg = iselVecExpr(env, e->Iex.Unop.arg);
+ HReg dst = newVRegV(env);
+ REQUIRE_SSE2;
+ addInstr(env, mk_vMOVsd_RR(arg, dst));
+ addInstr(env, X86Instr_Sse64FLo(op, arg, dst));
+ return dst;
+ }
+
+ case Iop_32UtoV128: {
+ HReg dst = newVRegV(env);
+ X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+ X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Push(rmi));
+ addInstr(env, X86Instr_SseLdzLO(4, dst, esp0));
+ add_to_esp(env, 4);
+ return dst;
+ }
+
+ case Iop_64UtoV128: {
+ HReg rHi, rLo;
+ HReg dst = newVRegV(env);
+ X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+ iselInt64Expr(&rHi, &rLo, env, e->Iex.Unop.arg);
+ addInstr(env, X86Instr_Push(X86RMI_Reg(rHi)));
+ addInstr(env, X86Instr_Push(X86RMI_Reg(rLo)));
+ addInstr(env, X86Instr_SseLdzLO(8, dst, esp0));
+ add_to_esp(env, 8);
+ return dst;
+ }
+
+ default:
+ break;
+ } /* switch (e->Iex.Unop.op) */
+ } /* if (e->tag == Iex_Unop) */
+
+ if (e->tag == Iex_Binop) {
+ switch (e->Iex.Binop.op) {
+
+ case Iop_SetV128lo32: {
+ HReg dst = newVRegV(env);
+ HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg srcI = iselIntExpr_R(env, e->Iex.Binop.arg2);
+ X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+ sub_from_esp(env, 16);
+ addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
+ addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcI), esp0));
+ addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
+ add_to_esp(env, 16);
+ return dst;
+ }
+
+ case Iop_SetV128lo64: {
+ HReg dst = newVRegV(env);
+ HReg srcV = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg srcIhi, srcIlo;
+ X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+ X86AMode* esp4 = advance4(esp0);
+ iselInt64Expr(&srcIhi, &srcIlo, env, e->Iex.Binop.arg2);
+ sub_from_esp(env, 16);
+ addInstr(env, X86Instr_SseLdSt(False/*store*/, srcV, esp0));
+ addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIlo), esp0));
+ addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(srcIhi), esp4));
+ addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
+ add_to_esp(env, 16);
+ return dst;
+ }
+
+ case Iop_64HLtoV128: {
+ HReg r3, r2, r1, r0;
+ X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+ X86AMode* esp4 = advance4(esp0);
+ X86AMode* esp8 = advance4(esp4);
+ X86AMode* esp12 = advance4(esp8);
+ HReg dst = newVRegV(env);
+ /* do this via the stack (easy, convenient, etc) */
+ sub_from_esp(env, 16);
+ /* Do the less significant 64 bits */
+ iselInt64Expr(&r1, &r0, env, e->Iex.Binop.arg2);
+ addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r0), esp0));
+ addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r1), esp4));
+ /* Do the more significant 64 bits */
+ iselInt64Expr(&r3, &r2, env, e->Iex.Binop.arg1);
+ addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r2), esp8));
+ addInstr(env, X86Instr_Alu32M(Xalu_MOV, X86RI_Reg(r3), esp12));
+ /* Fetch result back from stack. */
+ addInstr(env, X86Instr_SseLdSt(True/*load*/, dst, esp0));
+ add_to_esp(env, 16);
+ return dst;
+ }
+
+ case Iop_CmpEQ32Fx4: op = Xsse_CMPEQF; goto do_32Fx4;
+ case Iop_CmpLT32Fx4: op = Xsse_CMPLTF; goto do_32Fx4;
+ case Iop_CmpLE32Fx4: op = Xsse_CMPLEF; goto do_32Fx4;
+ case Iop_CmpUN32Fx4: op = Xsse_CMPUNF; goto do_32Fx4;
+ case Iop_Add32Fx4: op = Xsse_ADDF; goto do_32Fx4;
+ case Iop_Div32Fx4: op = Xsse_DIVF; goto do_32Fx4;
+ case Iop_Max32Fx4: op = Xsse_MAXF; goto do_32Fx4;
+ case Iop_Min32Fx4: op = Xsse_MINF; goto do_32Fx4;
+ case Iop_Mul32Fx4: op = Xsse_MULF; goto do_32Fx4;
+ case Iop_Sub32Fx4: op = Xsse_SUBF; goto do_32Fx4;
+ do_32Fx4:
+ {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(argL, dst));
+ addInstr(env, X86Instr_Sse32Fx4(op, argR, dst));
+ return dst;
+ }
+
+ case Iop_CmpEQ64Fx2: op = Xsse_CMPEQF; goto do_64Fx2;
+ case Iop_CmpLT64Fx2: op = Xsse_CMPLTF; goto do_64Fx2;
+ case Iop_CmpLE64Fx2: op = Xsse_CMPLEF; goto do_64Fx2;
+ case Iop_CmpUN64Fx2: op = Xsse_CMPUNF; goto do_64Fx2;
+ case Iop_Add64Fx2: op = Xsse_ADDF; goto do_64Fx2;
+ case Iop_Div64Fx2: op = Xsse_DIVF; goto do_64Fx2;
+ case Iop_Max64Fx2: op = Xsse_MAXF; goto do_64Fx2;
+ case Iop_Min64Fx2: op = Xsse_MINF; goto do_64Fx2;
+ case Iop_Mul64Fx2: op = Xsse_MULF; goto do_64Fx2;
+ case Iop_Sub64Fx2: op = Xsse_SUBF; goto do_64Fx2;
+ do_64Fx2:
+ {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ REQUIRE_SSE2;
+ addInstr(env, mk_vMOVsd_RR(argL, dst));
+ addInstr(env, X86Instr_Sse64Fx2(op, argR, dst));
+ return dst;
+ }
+
+ case Iop_CmpEQ32F0x4: op = Xsse_CMPEQF; goto do_32F0x4;
+ case Iop_CmpLT32F0x4: op = Xsse_CMPLTF; goto do_32F0x4;
+ case Iop_CmpLE32F0x4: op = Xsse_CMPLEF; goto do_32F0x4;
+ case Iop_CmpUN32F0x4: op = Xsse_CMPUNF; goto do_32F0x4;
+ case Iop_Add32F0x4: op = Xsse_ADDF; goto do_32F0x4;
+ case Iop_Div32F0x4: op = Xsse_DIVF; goto do_32F0x4;
+ case Iop_Max32F0x4: op = Xsse_MAXF; goto do_32F0x4;
+ case Iop_Min32F0x4: op = Xsse_MINF; goto do_32F0x4;
+ case Iop_Mul32F0x4: op = Xsse_MULF; goto do_32F0x4;
+ case Iop_Sub32F0x4: op = Xsse_SUBF; goto do_32F0x4;
+ do_32F0x4: {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(argL, dst));
+ addInstr(env, X86Instr_Sse32FLo(op, argR, dst));
+ return dst;
+ }
+
+ case Iop_CmpEQ64F0x2: op = Xsse_CMPEQF; goto do_64F0x2;
+ case Iop_CmpLT64F0x2: op = Xsse_CMPLTF; goto do_64F0x2;
+ case Iop_CmpLE64F0x2: op = Xsse_CMPLEF; goto do_64F0x2;
+ case Iop_CmpUN64F0x2: op = Xsse_CMPUNF; goto do_64F0x2;
+ case Iop_Add64F0x2: op = Xsse_ADDF; goto do_64F0x2;
+ case Iop_Div64F0x2: op = Xsse_DIVF; goto do_64F0x2;
+ case Iop_Max64F0x2: op = Xsse_MAXF; goto do_64F0x2;
+ case Iop_Min64F0x2: op = Xsse_MINF; goto do_64F0x2;
+ case Iop_Mul64F0x2: op = Xsse_MULF; goto do_64F0x2;
+ case Iop_Sub64F0x2: op = Xsse_SUBF; goto do_64F0x2;
+ do_64F0x2: {
+ HReg argL = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg argR = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ REQUIRE_SSE2;
+ addInstr(env, mk_vMOVsd_RR(argL, dst));
+ addInstr(env, X86Instr_Sse64FLo(op, argR, dst));
+ return dst;
+ }
+
+ case Iop_QNarrow32Sx4:
+ op = Xsse_PACKSSD; arg1isEReg = True; goto do_SseReRg;
+ case Iop_QNarrow16Sx8:
+ op = Xsse_PACKSSW; arg1isEReg = True; goto do_SseReRg;
+ case Iop_QNarrow16Ux8:
+ op = Xsse_PACKUSW; arg1isEReg = True; goto do_SseReRg;
+
+ case Iop_InterleaveHI8x16:
+ op = Xsse_UNPCKHB; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveHI16x8:
+ op = Xsse_UNPCKHW; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveHI32x4:
+ op = Xsse_UNPCKHD; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveHI64x2:
+ op = Xsse_UNPCKHQ; arg1isEReg = True; goto do_SseReRg;
+
+ case Iop_InterleaveLO8x16:
+ op = Xsse_UNPCKLB; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveLO16x8:
+ op = Xsse_UNPCKLW; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveLO32x4:
+ op = Xsse_UNPCKLD; arg1isEReg = True; goto do_SseReRg;
+ case Iop_InterleaveLO64x2:
+ op = Xsse_UNPCKLQ; arg1isEReg = True; goto do_SseReRg;
+
+ case Iop_AndV128: op = Xsse_AND; goto do_SseReRg;
+ case Iop_OrV128: op = Xsse_OR; goto do_SseReRg;
+ case Iop_XorV128: op = Xsse_XOR; goto do_SseReRg;
+ case Iop_Add8x16: op = Xsse_ADD8; goto do_SseReRg;
+ case Iop_Add16x8: op = Xsse_ADD16; goto do_SseReRg;
+ case Iop_Add32x4: op = Xsse_ADD32; goto do_SseReRg;
+ case Iop_Add64x2: op = Xsse_ADD64; goto do_SseReRg;
+ case Iop_QAdd8Sx16: op = Xsse_QADD8S; goto do_SseReRg;
+ case Iop_QAdd16Sx8: op = Xsse_QADD16S; goto do_SseReRg;
+ case Iop_QAdd8Ux16: op = Xsse_QADD8U; goto do_SseReRg;
+ case Iop_QAdd16Ux8: op = Xsse_QADD16U; goto do_SseReRg;
+ case Iop_Avg8Ux16: op = Xsse_AVG8U; goto do_SseReRg;
+ case Iop_Avg16Ux8: op = Xsse_AVG16U; goto do_SseReRg;
+ case Iop_CmpEQ8x16: op = Xsse_CMPEQ8; goto do_SseReRg;
+ case Iop_CmpEQ16x8: op = Xsse_CMPEQ16; goto do_SseReRg;
+ case Iop_CmpEQ32x4: op = Xsse_CMPEQ32; goto do_SseReRg;
+ case Iop_CmpGT8Sx16: op = Xsse_CMPGT8S; goto do_SseReRg;
+ case Iop_CmpGT16Sx8: op = Xsse_CMPGT16S; goto do_SseReRg;
+ case Iop_CmpGT32Sx4: op = Xsse_CMPGT32S; goto do_SseReRg;
+ case Iop_Max16Sx8: op = Xsse_MAX16S; goto do_SseReRg;
+ case Iop_Max8Ux16: op = Xsse_MAX8U; goto do_SseReRg;
+ case Iop_Min16Sx8: op = Xsse_MIN16S; goto do_SseReRg;
+ case Iop_Min8Ux16: op = Xsse_MIN8U; goto do_SseReRg;
+ case Iop_MulHi16Ux8: op = Xsse_MULHI16U; goto do_SseReRg;
+ case Iop_MulHi16Sx8: op = Xsse_MULHI16S; goto do_SseReRg;
+ case Iop_Mul16x8: op = Xsse_MUL16; goto do_SseReRg;
+ case Iop_Sub8x16: op = Xsse_SUB8; goto do_SseReRg;
+ case Iop_Sub16x8: op = Xsse_SUB16; goto do_SseReRg;
+ case Iop_Sub32x4: op = Xsse_SUB32; goto do_SseReRg;
+ case Iop_Sub64x2: op = Xsse_SUB64; goto do_SseReRg;
+ case Iop_QSub8Sx16: op = Xsse_QSUB8S; goto do_SseReRg;
+ case Iop_QSub16Sx8: op = Xsse_QSUB16S; goto do_SseReRg;
+ case Iop_QSub8Ux16: op = Xsse_QSUB8U; goto do_SseReRg;
+ case Iop_QSub16Ux8: op = Xsse_QSUB16U; goto do_SseReRg;
+ do_SseReRg: {
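+         /* arg1isEReg marks the non-commutative cases (the pack and
+            interleave ops) in which arg1 must appear as the E
+            (second) operand of the SSE instruction, rather than
+            being the one copied into dst. */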
+ HReg arg1 = iselVecExpr(env, e->Iex.Binop.arg1);
+ HReg arg2 = iselVecExpr(env, e->Iex.Binop.arg2);
+ HReg dst = newVRegV(env);
+ if (op != Xsse_OR && op != Xsse_AND && op != Xsse_XOR)
+ REQUIRE_SSE2;
+ if (arg1isEReg) {
+ addInstr(env, mk_vMOVsd_RR(arg2, dst));
+ addInstr(env, X86Instr_SseReRg(op, arg1, dst));
+ } else {
+ addInstr(env, mk_vMOVsd_RR(arg1, dst));
+ addInstr(env, X86Instr_SseReRg(op, arg2, dst));
+ }
+ return dst;
+ }
+
+ case Iop_ShlN16x8: op = Xsse_SHL16; goto do_SseShift;
+ case Iop_ShlN32x4: op = Xsse_SHL32; goto do_SseShift;
+ case Iop_ShlN64x2: op = Xsse_SHL64; goto do_SseShift;
+ case Iop_SarN16x8: op = Xsse_SAR16; goto do_SseShift;
+ case Iop_SarN32x4: op = Xsse_SAR32; goto do_SseShift;
+ case Iop_ShrN16x8: op = Xsse_SHR16; goto do_SseShift;
+ case Iop_ShrN32x4: op = Xsse_SHR32; goto do_SseShift;
+ case Iop_ShrN64x2: op = Xsse_SHR64; goto do_SseShift;
+ do_SseShift: {
+ HReg greg = iselVecExpr(env, e->Iex.Binop.arg1);
+ X86RMI* rmi = iselIntExpr_RMI(env, e->Iex.Binop.arg2);
+ X86AMode* esp0 = X86AMode_IR(0, hregX86_ESP());
+ HReg ereg = newVRegV(env);
+ HReg dst = newVRegV(env);
+ REQUIRE_SSE2;
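+         /* Materialise the shift count as a 128-bit value: push 12
+            zero bytes plus the 32-bit amount and reload into an xmm
+            register; the SSE shift-by-register forms take the count
+            from the low 64 bits of the E operand. */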
+ addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
+ addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
+ addInstr(env, X86Instr_Push(X86RMI_Imm(0)));
+ addInstr(env, X86Instr_Push(rmi));
+ addInstr(env, X86Instr_SseLdSt(True/*load*/, ereg, esp0));
+ addInstr(env, mk_vMOVsd_RR(greg, dst));
+ addInstr(env, X86Instr_SseReRg(op, ereg, dst));
+ add_to_esp(env, 16);
+ return dst;
+ }
+
+ default:
+ break;
+ } /* switch (e->Iex.Binop.op) */
+ } /* if (e->tag == Iex_Binop) */
+
+ if (e->tag == Iex_Mux0X) {
+ X86RM* r8 = iselIntExpr_RM(env, e->Iex.Mux0X.cond);
+ HReg rX = iselVecExpr(env, e->Iex.Mux0X.exprX);
+ HReg r0 = iselVecExpr(env, e->Iex.Mux0X.expr0);
+ HReg dst = newVRegV(env);
+ addInstr(env, mk_vMOVsd_RR(rX,dst));
+ addInstr(env, X86Instr_Test32(0xFF, r8));
+ addInstr(env, X86Instr_SseCMov(Xcc_Z,r0,dst));
+ return dst;
+ }
+
+ vec_fail:
+ vex_printf("iselVecExpr (hwcaps = %s): can't reduce\n",
+ LibVEX_ppVexHwCaps(VexArchX86,env->hwcaps));
+ ppIRExpr(e);
+ vpanic("iselVecExpr_wrk");
+
+# undef REQUIRE_SSE1
+# undef REQUIRE_SSE2
+# undef SSE2_OR_ABOVE
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Statements ---*/
+/*---------------------------------------------------------*/
+
+static void iselStmt ( ISelEnv* env, IRStmt* stmt )
+{
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ vex_printf("\n-- ");
+ ppIRStmt(stmt);
+ vex_printf("\n");
+ }
+
+ switch (stmt->tag) {
+
+ /* --------- STORE --------- */
+ case Ist_Store: {
+ IRType tya = typeOfIRExpr(env->type_env, stmt->Ist.Store.addr);
+ IRType tyd = typeOfIRExpr(env->type_env, stmt->Ist.Store.data);
+ IREndness end = stmt->Ist.Store.end;
+
+ if (tya != Ity_I32 || end != Iend_LE)
+ goto stmt_fail;
+
+ if (tyd == Ity_I32) {
+ X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Store.data);
+ addInstr(env, X86Instr_Alu32M(Xalu_MOV,ri,am));
+ return;
+ }
+ if (tyd == Ity_I8 || tyd == Ity_I16) {
+ X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ HReg r = iselIntExpr_R(env, stmt->Ist.Store.data);
+ addInstr(env, X86Instr_Store( toUChar(tyd==Ity_I8 ? 1 : 2),
+ r,am ));
+ return;
+ }
+ if (tyd == Ity_F64) {
+ X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ HReg r = iselDblExpr(env, stmt->Ist.Store.data);
+ addInstr(env, X86Instr_FpLdSt(False/*store*/, 8, r, am));
+ return;
+ }
+ if (tyd == Ity_F32) {
+ X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ HReg r = iselFltExpr(env, stmt->Ist.Store.data);
+ addInstr(env, X86Instr_FpLdSt(False/*store*/, 4, r, am));
+ return;
+ }
+ if (tyd == Ity_I64) {
+ HReg vHi, vLo, rA;
+ iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Store.data);
+ rA = iselIntExpr_R(env, stmt->Ist.Store.addr);
+ addInstr(env, X86Instr_Alu32M(
+ Xalu_MOV, X86RI_Reg(vLo), X86AMode_IR(0, rA)));
+ addInstr(env, X86Instr_Alu32M(
+ Xalu_MOV, X86RI_Reg(vHi), X86AMode_IR(4, rA)));
+ return;
+ }
+ if (tyd == Ity_V128) {
+ X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.Store.addr);
+ HReg r = iselVecExpr(env, stmt->Ist.Store.data);
+ addInstr(env, X86Instr_SseLdSt(False/*store*/, r, am));
+ return;
+ }
+ break;
+ }
+
+ /* --------- PUT --------- */
+ case Ist_Put: {
+ IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.Put.data);
+ if (ty == Ity_I32) {
+ /* We're going to write to memory, so compute the RHS into an
+ X86RI. */
+ X86RI* ri = iselIntExpr_RI(env, stmt->Ist.Put.data);
+ addInstr(env,
+ X86Instr_Alu32M(
+ Xalu_MOV,
+ ri,
+ X86AMode_IR(stmt->Ist.Put.offset,hregX86_EBP())
+ ));
+ return;
+ }
+ if (ty == Ity_I8 || ty == Ity_I16) {
+ HReg r = iselIntExpr_R(env, stmt->Ist.Put.data);
+ addInstr(env, X86Instr_Store(
+ toUChar(ty==Ity_I8 ? 1 : 2),
+ r,
+ X86AMode_IR(stmt->Ist.Put.offset,
+ hregX86_EBP())));
+ return;
+ }
+ if (ty == Ity_I64) {
+ HReg vHi, vLo;
+ X86AMode* am = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
+ X86AMode* am4 = advance4(am);
+ iselInt64Expr(&vHi, &vLo, env, stmt->Ist.Put.data);
+ addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vLo), am ));
+ addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(vHi), am4 ));
+ return;
+ }
+ if (ty == Ity_V128) {
+ HReg vec = iselVecExpr(env, stmt->Ist.Put.data);
+ X86AMode* am = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
+ addInstr(env, X86Instr_SseLdSt(False/*store*/, vec, am));
+ return;
+ }
+ if (ty == Ity_F32) {
+ HReg f32 = iselFltExpr(env, stmt->Ist.Put.data);
+ X86AMode* am = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
+ set_FPU_rounding_default(env); /* paranoia */
+ addInstr(env, X86Instr_FpLdSt( False/*store*/, 4, f32, am ));
+ return;
+ }
+ if (ty == Ity_F64) {
+ HReg f64 = iselDblExpr(env, stmt->Ist.Put.data);
+ X86AMode* am = X86AMode_IR(stmt->Ist.Put.offset, hregX86_EBP());
+ set_FPU_rounding_default(env); /* paranoia */
+ addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, f64, am ));
+ return;
+ }
+ break;
+ }
+
+ /* --------- Indexed PUT --------- */
+ case Ist_PutI: {
+ X86AMode* am
+ = genGuestArrayOffset(
+ env, stmt->Ist.PutI.descr,
+ stmt->Ist.PutI.ix, stmt->Ist.PutI.bias );
+
+ IRType ty = typeOfIRExpr(env->type_env, stmt->Ist.PutI.data);
+ if (ty == Ity_F64) {
+ HReg val = iselDblExpr(env, stmt->Ist.PutI.data);
+ addInstr(env, X86Instr_FpLdSt( False/*store*/, 8, val, am ));
+ return;
+ }
+ if (ty == Ity_I8) {
+ HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
+ addInstr(env, X86Instr_Store( 1, r, am ));
+ return;
+ }
+ if (ty == Ity_I32) {
+ HReg r = iselIntExpr_R(env, stmt->Ist.PutI.data);
+ addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(r), am ));
+ return;
+ }
+ if (ty == Ity_I64) {
+ HReg rHi, rLo;
+ X86AMode* am4 = advance4(am);
+ iselInt64Expr(&rHi, &rLo, env, stmt->Ist.PutI.data);
+ addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rLo), am ));
+ addInstr(env, X86Instr_Alu32M( Xalu_MOV, X86RI_Reg(rHi), am4 ));
+ return;
+ }
+ break;
+ }
+
+ /* --------- TMP --------- */
+ case Ist_WrTmp: {
+ IRTemp tmp = stmt->Ist.WrTmp.tmp;
+ IRType ty = typeOfIRTemp(env->type_env, tmp);
+
+         /* optimisation: if stmt->Ist.WrTmp.data is Add32(..,..),
+            compute it into an AMode and then use LEA.  This usually
+            produces fewer instructions, often because (for
+            Memcheck-created IR) we get t = address-expression and t
+            is later used twice; doing this naturally turns the
+            address-expression back into an X86 amode. */
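+         /* For instance (a hypothetical case, assuming the amode
+            builder matches this shape):
+               t = Add32(Add32(ebx, Shl32(eax,2)), 16)
+            can become a single "leal 16(%ebx,%eax,4)" rather than
+            an add sequence. */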
+ if (ty == Ity_I32
+ && stmt->Ist.WrTmp.data->tag == Iex_Binop
+ && stmt->Ist.WrTmp.data->Iex.Binop.op == Iop_Add32) {
+ X86AMode* am = iselIntExpr_AMode(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ if (am->tag == Xam_IR && am->Xam.IR.imm == 0) {
+ /* Hmm, iselIntExpr_AMode wimped out and just computed the
+ value into a register. Just emit a normal reg-reg move
+ so reg-alloc can coalesce it away in the usual way. */
+ HReg src = am->Xam.IR.reg;
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV, X86RMI_Reg(src), dst));
+ } else {
+ addInstr(env, X86Instr_Lea32(am,dst));
+ }
+ return;
+ }
+
+ if (ty == Ity_I32 || ty == Ity_I16 || ty == Ity_I8) {
+ X86RMI* rmi = iselIntExpr_RMI(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ addInstr(env, X86Instr_Alu32R(Xalu_MOV,rmi,dst));
+ return;
+ }
+ if (ty == Ity_I64) {
+ HReg rHi, rLo, dstHi, dstLo;
+ iselInt64Expr(&rHi,&rLo, env, stmt->Ist.WrTmp.data);
+ lookupIRTemp64( &dstHi, &dstLo, env, tmp);
+ addInstr(env, mk_iMOVsd_RR(rHi,dstHi) );
+ addInstr(env, mk_iMOVsd_RR(rLo,dstLo) );
+ return;
+ }
+ if (ty == Ity_I1) {
+ X86CondCode cond = iselCondCode(env, stmt->Ist.WrTmp.data);
+ HReg dst = lookupIRTemp(env, tmp);
+ addInstr(env, X86Instr_Set32(cond, dst));
+ return;
+ }
+ if (ty == Ity_F64) {
+ HReg dst = lookupIRTemp(env, tmp);
+ HReg src = iselDblExpr(env, stmt->Ist.WrTmp.data);
+ addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
+ return;
+ }
+ if (ty == Ity_F32) {
+ HReg dst = lookupIRTemp(env, tmp);
+ HReg src = iselFltExpr(env, stmt->Ist.WrTmp.data);
+ addInstr(env, X86Instr_FpUnary(Xfp_MOV,src,dst));
+ return;
+ }
+ if (ty == Ity_V128) {
+ HReg dst = lookupIRTemp(env, tmp);
+ HReg src = iselVecExpr(env, stmt->Ist.WrTmp.data);
+ addInstr(env, mk_vMOVsd_RR(src,dst));
+ return;
+ }
+ break;
+ }
+
+ /* --------- Call to DIRTY helper --------- */
+ case Ist_Dirty: {
+ IRType retty;
+ IRDirty* d = stmt->Ist.Dirty.details;
+ Bool passBBP = False;
+
+ if (d->nFxState == 0)
+ vassert(!d->needsBBP);
+
+ passBBP = toBool(d->nFxState > 0 && d->needsBBP);
+
+ /* Marshal args, do the call, clear stack. */
+ doHelperCall( env, passBBP, d->guard, d->cee, d->args );
+
+ /* Now figure out what to do with the returned value, if any. */
+ if (d->tmp == IRTemp_INVALID)
+ /* No return value. Nothing to do. */
+ return;
+
+ retty = typeOfIRTemp(env->type_env, d->tmp);
+ if (retty == Ity_I64) {
+ HReg dstHi, dstLo;
+ /* The returned value is in %edx:%eax. Park it in the
+ register-pair associated with tmp. */
+ lookupIRTemp64( &dstHi, &dstLo, env, d->tmp);
+ addInstr(env, mk_iMOVsd_RR(hregX86_EDX(),dstHi) );
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dstLo) );
+ return;
+ }
+ if (retty == Ity_I32 || retty == Ity_I16 || retty == Ity_I8) {
+ /* The returned value is in %eax. Park it in the register
+ associated with tmp. */
+ HReg dst = lookupIRTemp(env, d->tmp);
+ addInstr(env, mk_iMOVsd_RR(hregX86_EAX(),dst) );
+ return;
+ }
+ break;
+ }
+
+ /* --------- MEM FENCE --------- */
+ case Ist_MBE:
+ switch (stmt->Ist.MBE.event) {
+ case Imbe_Fence:
+ addInstr(env, X86Instr_MFence(env->hwcaps));
+ return;
+ default:
+ break;
+ }
+ break;
+
+ /* --------- ACAS --------- */
+ case Ist_CAS:
+ if (stmt->Ist.CAS.details->oldHi == IRTemp_INVALID) {
+ /* "normal" singleton CAS */
+ UChar sz;
+ IRCAS* cas = stmt->Ist.CAS.details;
+ IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
+ /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
+ X86AMode* am = iselIntExpr_AMode(env, cas->addr);
+ HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
+ HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
+ HReg rOldLo = lookupIRTemp(env, cas->oldLo);
+ vassert(cas->expdHi == NULL);
+ vassert(cas->dataHi == NULL);
+ addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
+ addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
+ addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
+ switch (ty) {
+ case Ity_I32: sz = 4; break;
+ case Ity_I16: sz = 2; break;
+ case Ity_I8: sz = 1; break;
+ default: goto unhandled_cas;
+ }
+ addInstr(env, X86Instr_ACAS(am, sz));
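+            /* lock cmpxchg sets ZF iff the CAS succeeded.  rOldLo
+               was preloaded with the expected value, which is right
+               on success; on failure (NZ), copy the actual old
+               value, left by the hardware in %eax, over it. */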
+ addInstr(env,
+ X86Instr_CMov32(Xcc_NZ,
+ X86RM_Reg(hregX86_EAX()), rOldLo));
+ return;
+ } else {
+ /* double CAS */
+ IRCAS* cas = stmt->Ist.CAS.details;
+ IRType ty = typeOfIRExpr(env->type_env, cas->dataLo);
+ /* only 32-bit allowed in this case */
+ /* get: cas->expdLo into %eax, and cas->dataLo into %ebx */
+ /* get: cas->expdHi into %edx, and cas->dataHi into %ecx */
+ X86AMode* am = iselIntExpr_AMode(env, cas->addr);
+ HReg rDataHi = iselIntExpr_R(env, cas->dataHi);
+ HReg rDataLo = iselIntExpr_R(env, cas->dataLo);
+ HReg rExpdHi = iselIntExpr_R(env, cas->expdHi);
+ HReg rExpdLo = iselIntExpr_R(env, cas->expdLo);
+ HReg rOldHi = lookupIRTemp(env, cas->oldHi);
+ HReg rOldLo = lookupIRTemp(env, cas->oldLo);
+ if (ty != Ity_I32)
+ goto unhandled_cas;
+ addInstr(env, mk_iMOVsd_RR(rExpdHi, rOldHi));
+ addInstr(env, mk_iMOVsd_RR(rExpdLo, rOldLo));
+ addInstr(env, mk_iMOVsd_RR(rExpdHi, hregX86_EDX()));
+ addInstr(env, mk_iMOVsd_RR(rExpdLo, hregX86_EAX()));
+ addInstr(env, mk_iMOVsd_RR(rDataHi, hregX86_ECX()));
+ addInstr(env, mk_iMOVsd_RR(rDataLo, hregX86_EBX()));
+ addInstr(env, X86Instr_DACAS(am));
+ addInstr(env,
+ X86Instr_CMov32(Xcc_NZ,
+ X86RM_Reg(hregX86_EDX()), rOldHi));
+ addInstr(env,
+ X86Instr_CMov32(Xcc_NZ,
+ X86RM_Reg(hregX86_EAX()), rOldLo));
+ return;
+ }
+ unhandled_cas:
+ break;
+
+ /* --------- INSTR MARK --------- */
+ /* Doesn't generate any executable code ... */
+ case Ist_IMark:
+ return;
+
+ /* --------- NO-OP --------- */
+ /* Fairly self-explanatory, wouldn't you say? */
+ case Ist_NoOp:
+ return;
+
+ /* --------- EXIT --------- */
+ case Ist_Exit: {
+ X86RI* dst;
+ X86CondCode cc;
+ if (stmt->Ist.Exit.dst->tag != Ico_U32)
+ vpanic("isel_x86: Ist_Exit: dst is not a 32-bit value");
+ dst = iselIntExpr_RI(env, IRExpr_Const(stmt->Ist.Exit.dst));
+ cc = iselCondCode(env,stmt->Ist.Exit.guard);
+ addInstr(env, X86Instr_Goto(stmt->Ist.Exit.jk, cc, dst));
+ return;
+ }
+
+ default: break;
+ }
+ stmt_fail:
+ ppIRStmt(stmt);
+ vpanic("iselStmt");
+}
+
+
+/*---------------------------------------------------------*/
+/*--- ISEL: Basic block terminators (Nexts) ---*/
+/*---------------------------------------------------------*/
+
+static void iselNext ( ISelEnv* env, IRExpr* next, IRJumpKind jk )
+{
+ X86RI* ri;
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ vex_printf("\n-- goto {");
+ ppIRJumpKind(jk);
+ vex_printf("} ");
+ ppIRExpr(next);
+ vex_printf("\n");
+ }
+ ri = iselIntExpr_RI(env, next);
+ addInstr(env, X86Instr_Goto(jk, Xcc_ALWAYS,ri));
+}
+
+
+/*---------------------------------------------------------*/
+/*--- Insn selector top-level ---*/
+/*---------------------------------------------------------*/
+
+/* Translate an entire SB to x86 code. */
+
+HInstrArray* iselSB_X86 ( IRSB* bb, VexArch arch_host,
+ VexArchInfo* archinfo_host,
+ VexAbiInfo* vbi/*UNUSED*/ )
+{
+ Int i, j;
+ HReg hreg, hregHI;
+ ISelEnv* env;
+ UInt hwcaps_host = archinfo_host->hwcaps;
+
+ /* sanity ... */
+ vassert(arch_host == VexArchX86);
+ vassert(0 == (hwcaps_host
+ & ~(VEX_HWCAPS_X86_SSE1
+ | VEX_HWCAPS_X86_SSE2
+ | VEX_HWCAPS_X86_SSE3
+ | VEX_HWCAPS_X86_LZCNT)));
+
+ /* Make up an initial environment to use. */
+ env = LibVEX_Alloc(sizeof(ISelEnv));
+ env->vreg_ctr = 0;
+
+ /* Set up output code array. */
+ env->code = newHInstrArray();
+
+ /* Copy BB's type env. */
+ env->type_env = bb->tyenv;
+
+ /* Make up an IRTemp -> virtual HReg mapping. This doesn't
+ change as we go along. */
+ env->n_vregmap = bb->tyenv->types_used;
+ env->vregmap = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
+ env->vregmapHI = LibVEX_Alloc(env->n_vregmap * sizeof(HReg));
+
+ /* and finally ... */
+ env->hwcaps = hwcaps_host;
+
+ /* For each IR temporary, allocate a suitably-kinded virtual
+ register. */
+ j = 0;
+ for (i = 0; i < env->n_vregmap; i++) {
+ hregHI = hreg = INVALID_HREG;
+ switch (bb->tyenv->types[i]) {
+ case Ity_I1:
+ case Ity_I8:
+ case Ity_I16:
+ case Ity_I32: hreg = mkHReg(j++, HRcInt32, True); break;
+ case Ity_I64: hreg = mkHReg(j++, HRcInt32, True);
+ hregHI = mkHReg(j++, HRcInt32, True); break;
+ case Ity_F32:
+ case Ity_F64: hreg = mkHReg(j++, HRcFlt64, True); break;
+ case Ity_V128: hreg = mkHReg(j++, HRcVec128, True); break;
+ default: ppIRType(bb->tyenv->types[i]);
+ vpanic("iselBB: IRTemp type");
+ }
+ env->vregmap[i] = hreg;
+ env->vregmapHI[i] = hregHI;
+ }
+ env->vreg_ctr = j;
+
+ /* Ok, finally we can iterate over the statements. */
+ for (i = 0; i < bb->stmts_used; i++)
+ iselStmt(env,bb->stmts[i]);
+
+ iselNext(env,bb->next,bb->jumpkind);
+
+ /* record the number of vregs we used. */
+ env->code->n_vregs = env->vreg_ctr;
+ return env->code;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end host_x86_isel.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_defs.c b/VEX/priv/ir_defs.c
new file mode 100644
index 0000000..f78db10
--- /dev/null
+++ b/VEX/priv/ir_defs.c
@@ -0,0 +1,3501 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin ir_defs.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+#include "main_util.h"
+
+
+/*---------------------------------------------------------------*/
+/*--- Printing the IR ---*/
+/*---------------------------------------------------------------*/
+
+void ppIRType ( IRType ty )
+{
+ switch (ty) {
+ case Ity_INVALID: vex_printf("Ity_INVALID"); break;
+ case Ity_I1: vex_printf( "I1"); break;
+ case Ity_I8: vex_printf( "I8"); break;
+ case Ity_I16: vex_printf( "I16"); break;
+ case Ity_I32: vex_printf( "I32"); break;
+ case Ity_I64: vex_printf( "I64"); break;
+ case Ity_I128: vex_printf( "I128"); break;
+ case Ity_F32: vex_printf( "F32"); break;
+ case Ity_F64: vex_printf( "F64"); break;
+ case Ity_V128: vex_printf( "V128"); break;
+ default: vex_printf("ty = 0x%x\n", (Int)ty);
+ vpanic("ppIRType");
+ }
+}
+
+void ppIRConst ( IRConst* con )
+{
+ union { ULong i64; Double f64; } u;
+ vassert(sizeof(ULong) == sizeof(Double));
+ switch (con->tag) {
+ case Ico_U1: vex_printf( "%d:I1", con->Ico.U1 ? 1 : 0); break;
+ case Ico_U8: vex_printf( "0x%x:I8", (UInt)(con->Ico.U8)); break;
+ case Ico_U16: vex_printf( "0x%x:I16", (UInt)(con->Ico.U16)); break;
+ case Ico_U32: vex_printf( "0x%x:I32", (UInt)(con->Ico.U32)); break;
+ case Ico_U64: vex_printf( "0x%llx:I64", (ULong)(con->Ico.U64)); break;
+ case Ico_F64: u.f64 = con->Ico.F64;
+ vex_printf( "F64{0x%llx}", u.i64);
+ break;
+ case Ico_F64i: vex_printf( "F64i{0x%llx}", con->Ico.F64i); break;
+ case Ico_V128: vex_printf( "V128{0x%04x}", (UInt)(con->Ico.V128)); break;
+ default: vpanic("ppIRConst");
+ }
+}
+
+void ppIRCallee ( IRCallee* ce )
+{
+ vex_printf("%s", ce->name);
+ if (ce->regparms > 0)
+ vex_printf("[rp=%d]", ce->regparms);
+ if (ce->mcx_mask > 0)
+ vex_printf("[mcx=0x%x]", ce->mcx_mask);
+ vex_printf("{%p}", (void*)ce->addr);
+}
+
+void ppIRRegArray ( IRRegArray* arr )
+{
+ vex_printf("(%d:%dx", arr->base, arr->nElems);
+ ppIRType(arr->elemTy);
+ vex_printf(")");
+}
+
+void ppIRTemp ( IRTemp tmp )
+{
+ if (tmp == IRTemp_INVALID)
+ vex_printf("IRTemp_INVALID");
+ else
+ vex_printf( "t%d", (Int)tmp);
+}
+
+void ppIROp ( IROp op )
+{
+ HChar* str = NULL;
+ IROp base;
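+   /* The sized integer ops are declared in consecutive 8/16/32/64
+      groups, so the range cases below (a gcc extension) just record
+      a base name and base op; a size suffix can later be derived
+      from (op - base). */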
+ switch (op) {
+ case Iop_Add8 ... Iop_Add64:
+ str = "Add"; base = Iop_Add8; break;
+ case Iop_Sub8 ... Iop_Sub64:
+ str = "Sub"; base = Iop_Sub8; break;
+ case Iop_Mul8 ... Iop_Mul64:
+ str = "Mul"; base = Iop_Mul8; break;
+ case Iop_Or8 ... Iop_Or64:
+ str = "Or"; base = Iop_Or8; break;
+ case Iop_And8 ... Iop_And64:
+ str = "And"; base = Iop_And8; break;
+ case Iop_Xor8 ... Iop_Xor64:
+ str = "Xor"; base = Iop_Xor8; break;
+ case Iop_Shl8 ... Iop_Shl64:
+ str = "Shl"; base = Iop_Shl8; break;
+ case Iop_Shr8 ... Iop_Shr64:
+ str = "Shr"; base = Iop_Shr8; break;
+ case Iop_Sar8 ... Iop_Sar64:
+ str = "Sar"; base = Iop_Sar8; break;
+ case Iop_CmpEQ8 ... Iop_CmpEQ64:
+ str = "CmpEQ"; base = Iop_CmpEQ8; break;
+ case Iop_CmpNE8 ... Iop_CmpNE64:
+ str = "CmpNE"; base = Iop_CmpNE8; break;
+ case Iop_CasCmpEQ8 ... Iop_CasCmpEQ64:
+ str = "CasCmpEQ"; base = Iop_CasCmpEQ8; break;
+ case Iop_CasCmpNE8 ... Iop_CasCmpNE64:
+ str = "CasCmpNE"; base = Iop_CasCmpNE8; break;
+ case Iop_Not8 ... Iop_Not64:
+ str = "Not"; base = Iop_Not8; break;
+ /* other cases must explicitly "return;" */
+ case Iop_8Uto16: vex_printf("8Uto16"); return;
+ case Iop_8Uto32: vex_printf("8Uto32"); return;
+ case Iop_16Uto32: vex_printf("16Uto32"); return;
+ case Iop_8Sto16: vex_printf("8Sto16"); return;
+ case Iop_8Sto32: vex_printf("8Sto32"); return;
+ case Iop_16Sto32: vex_printf("16Sto32"); return;
+ case Iop_32Sto64: vex_printf("32Sto64"); return;
+ case Iop_32Uto64: vex_printf("32Uto64"); return;
+ case Iop_32to8: vex_printf("32to8"); return;
+ case Iop_16Uto64: vex_printf("16Uto64"); return;
+ case Iop_16Sto64: vex_printf("16Sto64"); return;
+ case Iop_8Uto64: vex_printf("8Uto64"); return;
+ case Iop_8Sto64: vex_printf("8Sto64"); return;
+ case Iop_64to16: vex_printf("64to16"); return;
+ case Iop_64to8: vex_printf("64to8"); return;
+
+ case Iop_Not1: vex_printf("Not1"); return;
+ case Iop_32to1: vex_printf("32to1"); return;
+ case Iop_64to1: vex_printf("64to1"); return;
+ case Iop_1Uto8: vex_printf("1Uto8"); return;
+ case Iop_1Uto32: vex_printf("1Uto32"); return;
+ case Iop_1Uto64: vex_printf("1Uto64"); return;
+ case Iop_1Sto8: vex_printf("1Sto8"); return;
+ case Iop_1Sto16: vex_printf("1Sto16"); return;
+ case Iop_1Sto32: vex_printf("1Sto32"); return;
+ case Iop_1Sto64: vex_printf("1Sto64"); return;
+
+ case Iop_MullS8: vex_printf("MullS8"); return;
+ case Iop_MullS16: vex_printf("MullS16"); return;
+ case Iop_MullS32: vex_printf("MullS32"); return;
+ case Iop_MullS64: vex_printf("MullS64"); return;
+ case Iop_MullU8: vex_printf("MullU8"); return;
+ case Iop_MullU16: vex_printf("MullU16"); return;
+ case Iop_MullU32: vex_printf("MullU32"); return;
+ case Iop_MullU64: vex_printf("MullU64"); return;
+
+ case Iop_Clz64: vex_printf("Clz64"); return;
+ case Iop_Clz32: vex_printf("Clz32"); return;
+ case Iop_Ctz64: vex_printf("Ctz64"); return;
+ case Iop_Ctz32: vex_printf("Ctz32"); return;
+
+ case Iop_CmpLT32S: vex_printf("CmpLT32S"); return;
+ case Iop_CmpLE32S: vex_printf("CmpLE32S"); return;
+ case Iop_CmpLT32U: vex_printf("CmpLT32U"); return;
+ case Iop_CmpLE32U: vex_printf("CmpLE32U"); return;
+
+ case Iop_CmpLT64S: vex_printf("CmpLT64S"); return;
+ case Iop_CmpLE64S: vex_printf("CmpLE64S"); return;
+ case Iop_CmpLT64U: vex_printf("CmpLT64U"); return;
+ case Iop_CmpLE64U: vex_printf("CmpLE64U"); return;
+
+ case Iop_CmpNEZ8: vex_printf("CmpNEZ8"); return;
+ case Iop_CmpNEZ16: vex_printf("CmpNEZ16"); return;
+ case Iop_CmpNEZ32: vex_printf("CmpNEZ32"); return;
+ case Iop_CmpNEZ64: vex_printf("CmpNEZ64"); return;
+
+ case Iop_CmpwNEZ32: vex_printf("CmpwNEZ32"); return;
+ case Iop_CmpwNEZ64: vex_printf("CmpwNEZ64"); return;
+
+ case Iop_Left8: vex_printf("Left8"); return;
+ case Iop_Left16: vex_printf("Left16"); return;
+ case Iop_Left32: vex_printf("Left32"); return;
+ case Iop_Left64: vex_printf("Left64"); return;
+ case Iop_Max32U: vex_printf("Max32U"); return;
+
+ case Iop_CmpORD32U: vex_printf("CmpORD32U"); return;
+ case Iop_CmpORD32S: vex_printf("CmpORD32S"); return;
+
+ case Iop_CmpORD64U: vex_printf("CmpORD64U"); return;
+ case Iop_CmpORD64S: vex_printf("CmpORD64S"); return;
+
+ case Iop_DivU32: vex_printf("DivU32"); return;
+ case Iop_DivS32: vex_printf("DivS32"); return;
+ case Iop_DivU64: vex_printf("DivU64"); return;
+ case Iop_DivS64: vex_printf("DivS64"); return;
+
+ case Iop_DivModU64to32: vex_printf("DivModU64to32"); return;
+ case Iop_DivModS64to32: vex_printf("DivModS64to32"); return;
+
+ case Iop_DivModU128to64: vex_printf("DivModU128to64"); return;
+ case Iop_DivModS128to64: vex_printf("DivModS128to64"); return;
+
+ case Iop_16HIto8: vex_printf("16HIto8"); return;
+ case Iop_16to8: vex_printf("16to8"); return;
+ case Iop_8HLto16: vex_printf("8HLto16"); return;
+
+ case Iop_32HIto16: vex_printf("32HIto16"); return;
+ case Iop_32to16: vex_printf("32to16"); return;
+ case Iop_16HLto32: vex_printf("16HLto32"); return;
+
+ case Iop_64HIto32: vex_printf("64HIto32"); return;
+ case Iop_64to32: vex_printf("64to32"); return;
+ case Iop_32HLto64: vex_printf("32HLto64"); return;
+
+ case Iop_128HIto64: vex_printf("128HIto64"); return;
+ case Iop_128to64: vex_printf("128to64"); return;
+ case Iop_64HLto128: vex_printf("64HLto128"); return;
+
+ case Iop_AddF64: vex_printf("AddF64"); return;
+ case Iop_SubF64: vex_printf("SubF64"); return;
+ case Iop_MulF64: vex_printf("MulF64"); return;
+ case Iop_DivF64: vex_printf("DivF64"); return;
+ case Iop_AddF64r32: vex_printf("AddF64r32"); return;
+ case Iop_SubF64r32: vex_printf("SubF64r32"); return;
+ case Iop_MulF64r32: vex_printf("MulF64r32"); return;
+ case Iop_DivF64r32: vex_printf("DivF64r32"); return;
+ case Iop_AddF32: vex_printf("AddF32"); return;
+ case Iop_SubF32: vex_printf("SubF32"); return;
+ case Iop_MulF32: vex_printf("MulF32"); return;
+ case Iop_DivF32: vex_printf("DivF32"); return;
+
+ case Iop_ScaleF64: vex_printf("ScaleF64"); return;
+ case Iop_AtanF64: vex_printf("AtanF64"); return;
+ case Iop_Yl2xF64: vex_printf("Yl2xF64"); return;
+ case Iop_Yl2xp1F64: vex_printf("Yl2xp1F64"); return;
+ case Iop_PRemF64: vex_printf("PRemF64"); return;
+ case Iop_PRemC3210F64: vex_printf("PRemC3210F64"); return;
+ case Iop_PRem1F64: vex_printf("PRem1F64"); return;
+ case Iop_PRem1C3210F64: vex_printf("PRem1C3210F64"); return;
+ case Iop_NegF64: vex_printf("NegF64"); return;
+ case Iop_AbsF64: vex_printf("AbsF64"); return;
+ case Iop_NegF32: vex_printf("NegF32"); return;
+ case Iop_AbsF32: vex_printf("AbsF32"); return;
+ case Iop_SqrtF64: vex_printf("SqrtF64"); return;
+ case Iop_SqrtF32: vex_printf("SqrtF32"); return;
+ case Iop_SinF64: vex_printf("SinF64"); return;
+ case Iop_CosF64: vex_printf("CosF64"); return;
+ case Iop_TanF64: vex_printf("TanF64"); return;
+ case Iop_2xm1F64: vex_printf("2xm1F64"); return;
+
+ case Iop_MAddF64: vex_printf("MAddF64"); return;
+ case Iop_MSubF64: vex_printf("MSubF64"); return;
+ case Iop_MAddF64r32: vex_printf("MAddF64r32"); return;
+ case Iop_MSubF64r32: vex_printf("MSubF64r32"); return;
+
+ case Iop_Est5FRSqrt: vex_printf("Est5FRSqrt"); return;
+ case Iop_RoundF64toF64_NEAREST: vex_printf("RoundF64toF64_NEAREST"); return;
+ case Iop_RoundF64toF64_NegINF: vex_printf("RoundF64toF64_NegINF"); return;
+ case Iop_RoundF64toF64_PosINF: vex_printf("RoundF64toF64_PosINF"); return;
+ case Iop_RoundF64toF64_ZERO: vex_printf("RoundF64toF64_ZERO"); return;
+
+ case Iop_TruncF64asF32: vex_printf("TruncF64asF32"); return;
+ case Iop_CalcFPRF: vex_printf("CalcFPRF"); return;
+
+ case Iop_Add16x2: vex_printf("Add16x2"); return;
+ case Iop_Sub16x2: vex_printf("Sub16x2"); return;
+ case Iop_QAdd16Sx2: vex_printf("QAdd16Sx2"); return;
+ case Iop_QAdd16Ux2: vex_printf("QAdd16Ux2"); return;
+ case Iop_QSub16Sx2: vex_printf("QSub16Sx2"); return;
+ case Iop_QSub16Ux2: vex_printf("QSub16Ux2"); return;
+ case Iop_HAdd16Ux2: vex_printf("HAdd16Ux2"); return;
+ case Iop_HAdd16Sx2: vex_printf("HAdd16Sx2"); return;
+ case Iop_HSub16Ux2: vex_printf("HSub16Ux2"); return;
+ case Iop_HSub16Sx2: vex_printf("HSub16Sx2"); return;
+
+ case Iop_Add8x4: vex_printf("Add8x4"); return;
+ case Iop_Sub8x4: vex_printf("Sub8x4"); return;
+ case Iop_QAdd8Sx4: vex_printf("QAdd8Sx4"); return;
+ case Iop_QAdd8Ux4: vex_printf("QAdd8Ux4"); return;
+ case Iop_QSub8Sx4: vex_printf("QSub8Sx4"); return;
+ case Iop_QSub8Ux4: vex_printf("QSub8Ux4"); return;
+ case Iop_HAdd8Ux4: vex_printf("HAdd8Ux4"); return;
+ case Iop_HAdd8Sx4: vex_printf("HAdd8Sx4"); return;
+ case Iop_HSub8Ux4: vex_printf("HSub8Ux4"); return;
+ case Iop_HSub8Sx4: vex_printf("HSub8Sx4"); return;
+ case Iop_Sad8Ux4: vex_printf("Sad8Ux4"); return;
+
+ case Iop_CmpNEZ16x2: vex_printf("CmpNEZ16x2"); return;
+ case Iop_CmpNEZ8x4: vex_printf("CmpNEZ8x4"); return;
+
+ case Iop_CmpF64: vex_printf("CmpF64"); return;
+
+ case Iop_F64toI16S: vex_printf("F64toI16S"); return;
+ case Iop_F64toI32S: vex_printf("F64toI32S"); return;
+ case Iop_F64toI64S: vex_printf("F64toI64S"); return;
+
+ case Iop_F64toI32U: vex_printf("F64toI32U"); return;
+
+ case Iop_I16StoF64: vex_printf("I16StoF64"); return;
+ case Iop_I32StoF64: vex_printf("I32StoF64"); return;
+ case Iop_I64StoF64: vex_printf("I64StoF64"); return;
+
+ case Iop_I32UtoF64: vex_printf("I32UtoF64"); return;
+
+ case Iop_F32toF64: vex_printf("F32toF64"); return;
+ case Iop_F64toF32: vex_printf("F64toF32"); return;
+
+ case Iop_RoundF64toInt: vex_printf("RoundF64toInt"); return;
+ case Iop_RoundF32toInt: vex_printf("RoundF32toInt"); return;
+ case Iop_RoundF64toF32: vex_printf("RoundF64toF32"); return;
+
+ case Iop_ReinterpF64asI64: vex_printf("ReinterpF64asI64"); return;
+ case Iop_ReinterpI64asF64: vex_printf("ReinterpI64asF64"); return;
+ case Iop_ReinterpF32asI32: vex_printf("ReinterpF32asI32"); return;
+ case Iop_ReinterpI32asF32: vex_printf("ReinterpI32asF32"); return;
+
+ case Iop_I32UtoFx4: vex_printf("I32UtoFx4"); return;
+ case Iop_I32StoFx4: vex_printf("I32StoFx4"); return;
+
+ case Iop_F32toF16x4: vex_printf("F32toF16x4"); return;
+ case Iop_F16toF32x4: vex_printf("F16toF32x4"); return;
+
+ case Iop_Rsqrte32Fx4: vex_printf("VRsqrte32Fx4"); return;
+ case Iop_Rsqrte32x4: vex_printf("VRsqrte32x4"); return;
+ case Iop_Rsqrte32Fx2: vex_printf("VRsqrte32Fx2"); return;
+ case Iop_Rsqrte32x2: vex_printf("VRsqrte32x2"); return;
+
+ case Iop_QFtoI32Ux4_RZ: vex_printf("QFtoI32Ux4_RZ"); return;
+ case Iop_QFtoI32Sx4_RZ: vex_printf("QFtoI32Sx4_RZ"); return;
+
+ case Iop_FtoI32Ux4_RZ: vex_printf("FtoI32Ux4_RZ"); return;
+ case Iop_FtoI32Sx4_RZ: vex_printf("FtoI32Sx4_RZ"); return;
+
+ case Iop_I32UtoFx2: vex_printf("I32UtoFx2"); return;
+ case Iop_I32StoFx2: vex_printf("I32StoFx2"); return;
+
+ case Iop_FtoI32Ux2_RZ: vex_printf("FtoI32Ux2_RZ"); return;
+ case Iop_FtoI32Sx2_RZ: vex_printf("FtoI32Sx2_RZ"); return;
+
+ case Iop_RoundF32x4_RM: vex_printf("RoundF32x4_RM"); return;
+ case Iop_RoundF32x4_RP: vex_printf("RoundF32x4_RP"); return;
+ case Iop_RoundF32x4_RN: vex_printf("RoundF32x4_RN"); return;
+ case Iop_RoundF32x4_RZ: vex_printf("RoundF32x4_RZ"); return;
+
+ case Iop_Abs8x8: vex_printf("Abs8x8"); return;
+ case Iop_Abs16x4: vex_printf("Abs16x4"); return;
+ case Iop_Abs32x2: vex_printf("Abs32x2"); return;
+ case Iop_Add8x8: vex_printf("Add8x8"); return;
+ case Iop_Add16x4: vex_printf("Add16x4"); return;
+ case Iop_Add32x2: vex_printf("Add32x2"); return;
+ case Iop_QAdd8Ux8: vex_printf("QAdd8Ux8"); return;
+ case Iop_QAdd16Ux4: vex_printf("QAdd16Ux4"); return;
+ case Iop_QAdd32Ux2: vex_printf("QAdd32Ux2"); return;
+ case Iop_QAdd64Ux1: vex_printf("QAdd64Ux1"); return;
+ case Iop_QAdd8Sx8: vex_printf("QAdd8Sx8"); return;
+ case Iop_QAdd16Sx4: vex_printf("QAdd16Sx4"); return;
+ case Iop_QAdd32Sx2: vex_printf("QAdd32Sx2"); return;
+ case Iop_QAdd64Sx1: vex_printf("QAdd64Sx1"); return;
+ case Iop_PwAdd8x8: vex_printf("PwAdd8x8"); return;
+ case Iop_PwAdd16x4: vex_printf("PwAdd16x4"); return;
+ case Iop_PwAdd32x2: vex_printf("PwAdd32x2"); return;
+ case Iop_PwAdd32Fx2: vex_printf("PwAdd32Fx2"); return;
+ case Iop_PwAddL8Ux8: vex_printf("PwAddL8Ux8"); return;
+ case Iop_PwAddL16Ux4: vex_printf("PwAddL16Ux4"); return;
+ case Iop_PwAddL32Ux2: vex_printf("PwAddL32Ux2"); return;
+ case Iop_PwAddL8Sx8: vex_printf("PwAddL8Sx8"); return;
+ case Iop_PwAddL16Sx4: vex_printf("PwAddL16Sx4"); return;
+ case Iop_PwAddL32Sx2: vex_printf("PwAddL32Sx2"); return;
+ case Iop_Sub8x8: vex_printf("Sub8x8"); return;
+ case Iop_Sub16x4: vex_printf("Sub16x4"); return;
+ case Iop_Sub32x2: vex_printf("Sub32x2"); return;
+ case Iop_QSub8Ux8: vex_printf("QSub8Ux8"); return;
+ case Iop_QSub16Ux4: vex_printf("QSub16Ux4"); return;
+ case Iop_QSub32Ux2: vex_printf("QSub32Ux2"); return;
+ case Iop_QSub64Ux1: vex_printf("QSub64Ux1"); return;
+ case Iop_QSub8Sx8: vex_printf("QSub8Sx8"); return;
+ case Iop_QSub16Sx4: vex_printf("QSub16Sx4"); return;
+ case Iop_QSub32Sx2: vex_printf("QSub32Sx2"); return;
+ case Iop_QSub64Sx1: vex_printf("QSub64Sx1"); return;
+ case Iop_Mul8x8: vex_printf("Mul8x8"); return;
+ case Iop_Mul16x4: vex_printf("Mul16x4"); return;
+ case Iop_Mul32x2: vex_printf("Mul32x2"); return;
+ case Iop_Mul32Fx2: vex_printf("Mul32Fx2"); return;
+ case Iop_PolynomialMul8x8: vex_printf("PolynomialMul8x8"); return;
+ case Iop_MulHi16Ux4: vex_printf("MulHi16Ux4"); return;
+ case Iop_MulHi16Sx4: vex_printf("MulHi16Sx4"); return;
+ case Iop_QDMulHi16Sx4: vex_printf("QDMulHi16Sx4"); return;
+ case Iop_QDMulHi32Sx2: vex_printf("QDMulHi32Sx2"); return;
+ case Iop_QRDMulHi16Sx4: vex_printf("QRDMulHi16Sx4"); return;
+ case Iop_QRDMulHi32Sx2: vex_printf("QRDMulHi32Sx2"); return;
+ case Iop_QDMulLong16Sx4: vex_printf("QDMulLong16Sx4"); return;
+ case Iop_QDMulLong32Sx2: vex_printf("QDMulLong32Sx2"); return;
+ case Iop_Avg8Ux8: vex_printf("Avg8Ux8"); return;
+ case Iop_Avg16Ux4: vex_printf("Avg16Ux4"); return;
+ case Iop_Max8Sx8: vex_printf("Max8Sx8"); return;
+ case Iop_Max16Sx4: vex_printf("Max16Sx4"); return;
+ case Iop_Max32Sx2: vex_printf("Max32Sx2"); return;
+ case Iop_Max8Ux8: vex_printf("Max8Ux8"); return;
+ case Iop_Max16Ux4: vex_printf("Max16Ux4"); return;
+ case Iop_Max32Ux2: vex_printf("Max32Ux2"); return;
+ case Iop_Min8Sx8: vex_printf("Min8Sx8"); return;
+ case Iop_Min16Sx4: vex_printf("Min16Sx4"); return;
+ case Iop_Min32Sx2: vex_printf("Min32Sx2"); return;
+ case Iop_Min8Ux8: vex_printf("Min8Ux8"); return;
+ case Iop_Min16Ux4: vex_printf("Min16Ux4"); return;
+ case Iop_Min32Ux2: vex_printf("Min32Ux2"); return;
+ case Iop_PwMax8Sx8: vex_printf("PwMax8Sx8"); return;
+ case Iop_PwMax16Sx4: vex_printf("PwMax16Sx4"); return;
+ case Iop_PwMax32Sx2: vex_printf("PwMax32Sx2"); return;
+ case Iop_PwMax8Ux8: vex_printf("PwMax8Ux8"); return;
+ case Iop_PwMax16Ux4: vex_printf("PwMax16Ux4"); return;
+ case Iop_PwMax32Ux2: vex_printf("PwMax32Ux2"); return;
+ case Iop_PwMin8Sx8: vex_printf("PwMin8Sx8"); return;
+ case Iop_PwMin16Sx4: vex_printf("PwMin16Sx4"); return;
+ case Iop_PwMin32Sx2: vex_printf("PwMin32Sx2"); return;
+ case Iop_PwMin8Ux8: vex_printf("PwMin8Ux8"); return;
+ case Iop_PwMin16Ux4: vex_printf("PwMin16Ux4"); return;
+ case Iop_PwMin32Ux2: vex_printf("PwMin32Ux2"); return;
+ case Iop_CmpEQ8x8: vex_printf("CmpEQ8x8"); return;
+ case Iop_CmpEQ16x4: vex_printf("CmpEQ16x4"); return;
+ case Iop_CmpEQ32x2: vex_printf("CmpEQ32x2"); return;
+ case Iop_CmpGT8Ux8: vex_printf("CmpGT8Ux8"); return;
+ case Iop_CmpGT16Ux4: vex_printf("CmpGT16Ux4"); return;
+ case Iop_CmpGT32Ux2: vex_printf("CmpGT32Ux2"); return;
+ case Iop_CmpGT8Sx8: vex_printf("CmpGT8Sx8"); return;
+ case Iop_CmpGT16Sx4: vex_printf("CmpGT16Sx4"); return;
+ case Iop_CmpGT32Sx2: vex_printf("CmpGT32Sx2"); return;
+ case Iop_Cnt8x8: vex_printf("Cnt8x8"); return;
+ case Iop_Clz8Sx8: vex_printf("Clz8Sx8"); return;
+ case Iop_Clz16Sx4: vex_printf("Clz16Sx4"); return;
+ case Iop_Clz32Sx2: vex_printf("Clz32Sx2"); return;
+ case Iop_Cls8Sx8: vex_printf("Cls8Sx8"); return;
+ case Iop_Cls16Sx4: vex_printf("Cls16Sx4"); return;
+ case Iop_Cls32Sx2: vex_printf("Cls32Sx2"); return;
+ case Iop_ShlN8x8: vex_printf("ShlN8x8"); return;
+ case Iop_ShlN16x4: vex_printf("ShlN16x4"); return;
+ case Iop_ShlN32x2: vex_printf("ShlN32x2"); return;
+ case Iop_ShrN8x8: vex_printf("ShrN8x8"); return;
+ case Iop_ShrN16x4: vex_printf("ShrN16x4"); return;
+ case Iop_ShrN32x2: vex_printf("ShrN32x2"); return;
+ case Iop_SarN8x8: vex_printf("SarN8x8"); return;
+ case Iop_SarN16x4: vex_printf("SarN16x4"); return;
+ case Iop_SarN32x2: vex_printf("SarN32x2"); return;
+ case Iop_QNarrow16Ux4: vex_printf("QNarrow16Ux4"); return;
+ case Iop_QNarrow16Sx4: vex_printf("QNarrow16Sx4"); return;
+ case Iop_QNarrow32Sx2: vex_printf("QNarrow32Sx2"); return;
+ case Iop_InterleaveHI8x8: vex_printf("InterleaveHI8x8"); return;
+ case Iop_InterleaveHI16x4: vex_printf("InterleaveHI16x4"); return;
+ case Iop_InterleaveHI32x2: vex_printf("InterleaveHI32x2"); return;
+ case Iop_InterleaveLO8x8: vex_printf("InterleaveLO8x8"); return;
+ case Iop_InterleaveLO16x4: vex_printf("InterleaveLO16x4"); return;
+ case Iop_InterleaveLO32x2: vex_printf("InterleaveLO32x2"); return;
+ case Iop_CatOddLanes8x8: vex_printf("CatOddLanes8x8"); return;
+ case Iop_CatOddLanes16x4: vex_printf("CatOddLanes16x4"); return;
+ case Iop_CatEvenLanes8x8: vex_printf("CatEvenLanes8x8"); return;
+ case Iop_CatEvenLanes16x4: vex_printf("CatEvenLanes16x4"); return;
+ case Iop_InterleaveOddLanes8x8: vex_printf("InterleaveOddLanes8x8"); return;
+ case Iop_InterleaveOddLanes16x4: vex_printf("InterleaveOddLanes16x4"); return;
+ case Iop_InterleaveEvenLanes8x8: vex_printf("InterleaveEvenLanes8x8"); return;
+ case Iop_InterleaveEvenLanes16x4: vex_printf("InterleaveEvenLanes16x4"); return;
+ case Iop_Shl8x8: vex_printf("Shl8x8"); return;
+ case Iop_Shl16x4: vex_printf("Shl16x4"); return;
+ case Iop_Shl32x2: vex_printf("Shl32x2"); return;
+ case Iop_Shr8x8: vex_printf("Shr8x8"); return;
+ case Iop_Shr16x4: vex_printf("Shr16x4"); return;
+ case Iop_Shr32x2: vex_printf("Shr32x2"); return;
+ case Iop_QShl8x8: vex_printf("QShl8x8"); return;
+ case Iop_QShl16x4: vex_printf("QShl16x4"); return;
+ case Iop_QShl32x2: vex_printf("QShl32x2"); return;
+ case Iop_QShl64x1: vex_printf("QShl64x1"); return;
+ case Iop_QSal8x8: vex_printf("QSal8x8"); return;
+ case Iop_QSal16x4: vex_printf("QSal16x4"); return;
+ case Iop_QSal32x2: vex_printf("QSal32x2"); return;
+ case Iop_QSal64x1: vex_printf("QSal64x1"); return;
+ case Iop_QShlN8x8: vex_printf("QShlN8x8"); return;
+ case Iop_QShlN16x4: vex_printf("QShlN16x4"); return;
+ case Iop_QShlN32x2: vex_printf("QShlN32x2"); return;
+ case Iop_QShlN64x1: vex_printf("QShlN64x1"); return;
+ case Iop_QShlN8Sx8: vex_printf("QShlN8Sx8"); return;
+ case Iop_QShlN16Sx4: vex_printf("QShlN16Sx4"); return;
+ case Iop_QShlN32Sx2: vex_printf("QShlN32Sx2"); return;
+ case Iop_QShlN64Sx1: vex_printf("QShlN64Sx1"); return;
+ case Iop_QSalN8x8: vex_printf("QSalN8x8"); return;
+ case Iop_QSalN16x4: vex_printf("QSalN16x4"); return;
+ case Iop_QSalN32x2: vex_printf("QSalN32x2"); return;
+ case Iop_QSalN64x1: vex_printf("QSalN64x1"); return;
+ case Iop_Sar8x8: vex_printf("Sar8x8"); return;
+ case Iop_Sar16x4: vex_printf("Sar16x4"); return;
+ case Iop_Sar32x2: vex_printf("Sar32x2"); return;
+ case Iop_Sal8x8: vex_printf("Sal8x8"); return;
+ case Iop_Sal16x4: vex_printf("Sal16x4"); return;
+ case Iop_Sal32x2: vex_printf("Sal32x2"); return;
+ case Iop_Sal64x1: vex_printf("Sal64x1"); return;
+ case Iop_Perm8x8: vex_printf("Perm8x8"); return;
+ case Iop_Reverse16_8x8: vex_printf("Reverse16_8x8"); return;
+ case Iop_Reverse32_8x8: vex_printf("Reverse32_8x8"); return;
+ case Iop_Reverse32_16x4: vex_printf("Reverse32_16x4"); return;
+ case Iop_Reverse64_8x8: vex_printf("Reverse64_8x8"); return;
+ case Iop_Reverse64_16x4: vex_printf("Reverse64_16x4"); return;
+ case Iop_Reverse64_32x2: vex_printf("Reverse64_32x2"); return;
+ case Iop_Abs32Fx2: vex_printf("Abs32Fx2"); return;
+
+ case Iop_CmpNEZ32x2: vex_printf("CmpNEZ32x2"); return;
+ case Iop_CmpNEZ16x4: vex_printf("CmpNEZ16x4"); return;
+ case Iop_CmpNEZ8x8: vex_printf("CmpNEZ8x8"); return;
+
+ case Iop_Add32Fx4: vex_printf("Add32Fx4"); return;
+ case Iop_Add32Fx2: vex_printf("Add32Fx2"); return;
+ case Iop_Add32F0x4: vex_printf("Add32F0x4"); return;
+ case Iop_Add64Fx2: vex_printf("Add64Fx2"); return;
+ case Iop_Add64F0x2: vex_printf("Add64F0x2"); return;
+
+ case Iop_Div32Fx4: vex_printf("Div32Fx4"); return;
+ case Iop_Div32F0x4: vex_printf("Div32F0x4"); return;
+ case Iop_Div64Fx2: vex_printf("Div64Fx2"); return;
+ case Iop_Div64F0x2: vex_printf("Div64F0x2"); return;
+
+ case Iop_Max32Fx4: vex_printf("Max32Fx4"); return;
+ case Iop_Max32Fx2: vex_printf("Max32Fx2"); return;
+ case Iop_PwMax32Fx4: vex_printf("PwMax32Fx4"); return;
+ case Iop_PwMax32Fx2: vex_printf("PwMax32Fx2"); return;
+ case Iop_Max32F0x4: vex_printf("Max32F0x4"); return;
+ case Iop_Max64Fx2: vex_printf("Max64Fx2"); return;
+ case Iop_Max64F0x2: vex_printf("Max64F0x2"); return;
+
+ case Iop_Min32Fx4: vex_printf("Min32Fx4"); return;
+ case Iop_Min32Fx2: vex_printf("Min32Fx2"); return;
+ case Iop_PwMin32Fx4: vex_printf("PwMin32Fx4"); return;
+ case Iop_PwMin32Fx2: vex_printf("PwMin32Fx2"); return;
+ case Iop_Min32F0x4: vex_printf("Min32F0x4"); return;
+ case Iop_Min64Fx2: vex_printf("Min64Fx2"); return;
+ case Iop_Min64F0x2: vex_printf("Min64F0x2"); return;
+
+ case Iop_Mul32Fx4: vex_printf("Mul32Fx4"); return;
+ case Iop_Mul32F0x4: vex_printf("Mul32F0x4"); return;
+ case Iop_Mul64Fx2: vex_printf("Mul64Fx2"); return;
+ case Iop_Mul64F0x2: vex_printf("Mul64F0x2"); return;
+
+ case Iop_Recip32x2: vex_printf("Recip32x2"); return;
+ case Iop_Recip32Fx2: vex_printf("Recip32Fx2"); return;
+ case Iop_Recip32Fx4: vex_printf("Recip32Fx4"); return;
+ case Iop_Recip32x4: vex_printf("Recip32x4"); return;
+ case Iop_Recip32F0x4: vex_printf("Recip32F0x4"); return;
+ case Iop_Recip64Fx2: vex_printf("Recip64Fx2"); return;
+ case Iop_Recip64F0x2: vex_printf("Recip64F0x2"); return;
+ case Iop_Recps32Fx2: vex_printf("VRecps32Fx2"); return;
+ case Iop_Recps32Fx4: vex_printf("VRecps32Fx4"); return;
+ case Iop_Abs32Fx4: vex_printf("Abs32Fx4"); return;
+ case Iop_Rsqrts32Fx4: vex_printf("VRsqrts32Fx4"); return;
+ case Iop_Rsqrts32Fx2: vex_printf("VRsqrts32Fx2"); return;
+
+ case Iop_RSqrt32Fx4: vex_printf("RSqrt32Fx4"); return;
+ case Iop_RSqrt32F0x4: vex_printf("RSqrt32F0x4"); return;
+ case Iop_RSqrt64Fx2: vex_printf("RSqrt64Fx2"); return;
+ case Iop_RSqrt64F0x2: vex_printf("RSqrt64F0x2"); return;
+
+ case Iop_Sqrt32Fx4: vex_printf("Sqrt32Fx4"); return;
+ case Iop_Sqrt32F0x4: vex_printf("Sqrt32F0x4"); return;
+ case Iop_Sqrt64Fx2: vex_printf("Sqrt64Fx2"); return;
+ case Iop_Sqrt64F0x2: vex_printf("Sqrt64F0x2"); return;
+
+ case Iop_Sub32Fx4: vex_printf("Sub32Fx4"); return;
+ case Iop_Sub32Fx2: vex_printf("Sub32Fx2"); return;
+ case Iop_Sub32F0x4: vex_printf("Sub32F0x4"); return;
+ case Iop_Sub64Fx2: vex_printf("Sub64Fx2"); return;
+ case Iop_Sub64F0x2: vex_printf("Sub64F0x2"); return;
+
+ case Iop_CmpEQ32Fx4: vex_printf("CmpEQ32Fx4"); return;
+ case Iop_CmpLT32Fx4: vex_printf("CmpLT32Fx4"); return;
+ case Iop_CmpLE32Fx4: vex_printf("CmpLE32Fx4"); return;
+ case Iop_CmpGT32Fx4: vex_printf("CmpGT32Fx4"); return;
+ case Iop_CmpGE32Fx4: vex_printf("CmpGE32Fx4"); return;
+ case Iop_CmpUN32Fx4: vex_printf("CmpUN32Fx4"); return;
+ case Iop_CmpEQ64Fx2: vex_printf("CmpEQ64Fx2"); return;
+ case Iop_CmpLT64Fx2: vex_printf("CmpLT64Fx2"); return;
+ case Iop_CmpLE64Fx2: vex_printf("CmpLE64Fx2"); return;
+ case Iop_CmpUN64Fx2: vex_printf("CmpUN64Fx2"); return;
+ case Iop_CmpGT32Fx2: vex_printf("CmpGT32Fx2"); return;
+ case Iop_CmpEQ32Fx2: vex_printf("CmpEQ32Fx2"); return;
+ case Iop_CmpGE32Fx2: vex_printf("CmpGE32Fx2"); return;
+
+ case Iop_CmpEQ32F0x4: vex_printf("CmpEQ32F0x4"); return;
+ case Iop_CmpLT32F0x4: vex_printf("CmpLT32F0x4"); return;
+ case Iop_CmpLE32F0x4: vex_printf("CmpLE32F0x4"); return;
+ case Iop_CmpUN32F0x4: vex_printf("CmpUN32F0x4"); return;
+ case Iop_CmpEQ64F0x2: vex_printf("CmpEQ64F0x2"); return;
+ case Iop_CmpLT64F0x2: vex_printf("CmpLT64F0x2"); return;
+ case Iop_CmpLE64F0x2: vex_printf("CmpLE64F0x2"); return;
+ case Iop_CmpUN64F0x2: vex_printf("CmpUN64F0x2"); return;
+
+ case Iop_Neg32Fx4: vex_printf("Neg32Fx4"); return;
+ case Iop_Neg32Fx2: vex_printf("Neg32Fx2"); return;
+
+ case Iop_V128to64: vex_printf("V128to64"); return;
+ case Iop_V128HIto64: vex_printf("V128HIto64"); return;
+ case Iop_64HLtoV128: vex_printf("64HLtoV128"); return;
+
+ case Iop_64UtoV128: vex_printf("64UtoV128"); return;
+ case Iop_SetV128lo64: vex_printf("SetV128lo64"); return;
+
+ case Iop_32UtoV128: vex_printf("32UtoV128"); return;
+ case Iop_V128to32: vex_printf("V128to32"); return;
+ case Iop_SetV128lo32: vex_printf("SetV128lo32"); return;
+
+ case Iop_Dup8x16: vex_printf("Dup8x16"); return;
+ case Iop_Dup16x8: vex_printf("Dup16x8"); return;
+ case Iop_Dup32x4: vex_printf("Dup32x4"); return;
+ case Iop_Dup8x8: vex_printf("Dup8x8"); return;
+ case Iop_Dup16x4: vex_printf("Dup16x4"); return;
+ case Iop_Dup32x2: vex_printf("Dup32x2"); return;
+
+ case Iop_NotV128: vex_printf("NotV128"); return;
+ case Iop_AndV128: vex_printf("AndV128"); return;
+ case Iop_OrV128: vex_printf("OrV128"); return;
+ case Iop_XorV128: vex_printf("XorV128"); return;
+
+ case Iop_CmpNEZ8x16: vex_printf("CmpNEZ8x16"); return;
+ case Iop_CmpNEZ16x8: vex_printf("CmpNEZ16x8"); return;
+ case Iop_CmpNEZ32x4: vex_printf("CmpNEZ32x4"); return;
+ case Iop_CmpNEZ64x2: vex_printf("CmpNEZ64x2"); return;
+
+ case Iop_Abs8x16: vex_printf("Abs8x16"); return;
+ case Iop_Abs16x8: vex_printf("Abs16x8"); return;
+ case Iop_Abs32x4: vex_printf("Abs32x4"); return;
+
+ case Iop_Add8x16: vex_printf("Add8x16"); return;
+ case Iop_Add16x8: vex_printf("Add16x8"); return;
+ case Iop_Add32x4: vex_printf("Add32x4"); return;
+ case Iop_Add64x2: vex_printf("Add64x2"); return;
+ case Iop_QAdd8Ux16: vex_printf("QAdd8Ux16"); return;
+ case Iop_QAdd16Ux8: vex_printf("QAdd16Ux8"); return;
+ case Iop_QAdd32Ux4: vex_printf("QAdd32Ux4"); return;
+ case Iop_QAdd8Sx16: vex_printf("QAdd8Sx16"); return;
+ case Iop_QAdd16Sx8: vex_printf("QAdd16Sx8"); return;
+ case Iop_QAdd32Sx4: vex_printf("QAdd32Sx4"); return;
+ case Iop_QAdd64Ux2: vex_printf("QAdd64Ux2"); return;
+ case Iop_QAdd64Sx2: vex_printf("QAdd64Sx2"); return;
+ case Iop_PwAdd8x16: vex_printf("PwAdd8x16"); return;
+ case Iop_PwAdd16x8: vex_printf("PwAdd16x8"); return;
+ case Iop_PwAdd32x4: vex_printf("PwAdd32x4"); return;
+ case Iop_PwAddL8Ux16: vex_printf("PwAddL8Ux16"); return;
+ case Iop_PwAddL16Ux8: vex_printf("PwAddL16Ux8"); return;
+ case Iop_PwAddL32Ux4: vex_printf("PwAddL32Ux4"); return;
+ case Iop_PwAddL8Sx16: vex_printf("PwAddL8Sx16"); return;
+ case Iop_PwAddL16Sx8: vex_printf("PwAddL16Sx8"); return;
+ case Iop_PwAddL32Sx4: vex_printf("PwAddL32Sx4"); return;
+
+ case Iop_Sub8x16: vex_printf("Sub8x16"); return;
+ case Iop_Sub16x8: vex_printf("Sub16x8"); return;
+ case Iop_Sub32x4: vex_printf("Sub32x4"); return;
+ case Iop_Sub64x2: vex_printf("Sub64x2"); return;
+ case Iop_QSub8Ux16: vex_printf("QSub8Ux16"); return;
+ case Iop_QSub16Ux8: vex_printf("QSub16Ux8"); return;
+ case Iop_QSub32Ux4: vex_printf("QSub32Ux4"); return;
+ case Iop_QSub8Sx16: vex_printf("QSub8Sx16"); return;
+ case Iop_QSub16Sx8: vex_printf("QSub16Sx8"); return;
+ case Iop_QSub32Sx4: vex_printf("QSub32Sx4"); return;
+ case Iop_QSub64Ux2: vex_printf("QSub64Ux2"); return;
+ case Iop_QSub64Sx2: vex_printf("QSub64Sx2"); return;
+
+ case Iop_Mul8x16: vex_printf("Mul8x16"); return;
+ case Iop_Mul16x8: vex_printf("Mul16x8"); return;
+ case Iop_Mul32x4: vex_printf("Mul32x4"); return;
+ case Iop_Mull8Ux8: vex_printf("Mull8Ux8"); return;
+ case Iop_Mull8Sx8: vex_printf("Mull8Sx8"); return;
+ case Iop_Mull16Ux4: vex_printf("Mull16Ux4"); return;
+ case Iop_Mull16Sx4: vex_printf("Mull16Sx4"); return;
+ case Iop_Mull32Ux2: vex_printf("Mull32Ux2"); return;
+ case Iop_Mull32Sx2: vex_printf("Mull32Sx2"); return;
+ case Iop_PolynomialMul8x16: vex_printf("PolynomialMul8x16"); return;
+ case Iop_PolynomialMull8x8: vex_printf("PolynomialMull8x8"); return;
+ case Iop_MulHi16Ux8: vex_printf("MulHi16Ux8"); return;
+ case Iop_MulHi32Ux4: vex_printf("MulHi32Ux4"); return;
+ case Iop_MulHi16Sx8: vex_printf("MulHi16Sx8"); return;
+ case Iop_MulHi32Sx4: vex_printf("MulHi32Sx4"); return;
+ case Iop_QDMulHi16Sx8: vex_printf("QDMulHi16Sx8"); return;
+ case Iop_QDMulHi32Sx4: vex_printf("QDMulHi32Sx4"); return;
+ case Iop_QRDMulHi16Sx8: vex_printf("QRDMulHi16Sx8"); return;
+ case Iop_QRDMulHi32Sx4: vex_printf("QRDMulHi32Sx4"); return;
+
+ case Iop_MullEven8Ux16: vex_printf("MullEven8Ux16"); return;
+ case Iop_MullEven16Ux8: vex_printf("MullEven16Ux8"); return;
+ case Iop_MullEven8Sx16: vex_printf("MullEven8Sx16"); return;
+ case Iop_MullEven16Sx8: vex_printf("MullEven16Sx8"); return;
+
+ case Iop_Avg8Ux16: vex_printf("Avg8Ux16"); return;
+ case Iop_Avg16Ux8: vex_printf("Avg16Ux8"); return;
+ case Iop_Avg32Ux4: vex_printf("Avg32Ux4"); return;
+ case Iop_Avg8Sx16: vex_printf("Avg8Sx16"); return;
+ case Iop_Avg16Sx8: vex_printf("Avg16Sx8"); return;
+ case Iop_Avg32Sx4: vex_printf("Avg32Sx4"); return;
+
+ case Iop_Max8Sx16: vex_printf("Max8Sx16"); return;
+ case Iop_Max16Sx8: vex_printf("Max16Sx8"); return;
+ case Iop_Max32Sx4: vex_printf("Max32Sx4"); return;
+ case Iop_Max8Ux16: vex_printf("Max8Ux16"); return;
+ case Iop_Max16Ux8: vex_printf("Max16Ux8"); return;
+ case Iop_Max32Ux4: vex_printf("Max32Ux4"); return;
+
+ case Iop_Min8Sx16: vex_printf("Min8Sx16"); return;
+ case Iop_Min16Sx8: vex_printf("Min16Sx8"); return;
+ case Iop_Min32Sx4: vex_printf("Min32Sx4"); return;
+ case Iop_Min8Ux16: vex_printf("Min8Ux16"); return;
+ case Iop_Min16Ux8: vex_printf("Min16Ux8"); return;
+ case Iop_Min32Ux4: vex_printf("Min32Ux4"); return;
+
+ case Iop_CmpEQ8x16: vex_printf("CmpEQ8x16"); return;
+ case Iop_CmpEQ16x8: vex_printf("CmpEQ16x8"); return;
+ case Iop_CmpEQ32x4: vex_printf("CmpEQ32x4"); return;
+ case Iop_CmpGT8Sx16: vex_printf("CmpGT8Sx16"); return;
+ case Iop_CmpGT16Sx8: vex_printf("CmpGT16Sx8"); return;
+ case Iop_CmpGT32Sx4: vex_printf("CmpGT32Sx4"); return;
+ case Iop_CmpGT64Sx2: vex_printf("CmpGT64Sx2"); return;
+ case Iop_CmpGT8Ux16: vex_printf("CmpGT8Ux16"); return;
+ case Iop_CmpGT16Ux8: vex_printf("CmpGT16Ux8"); return;
+ case Iop_CmpGT32Ux4: vex_printf("CmpGT32Ux4"); return;
+
+ case Iop_Cnt8x16: vex_printf("Cnt8x16"); return;
+ case Iop_Clz8Sx16: vex_printf("Clz8Sx16"); return;
+ case Iop_Clz16Sx8: vex_printf("Clz16Sx8"); return;
+ case Iop_Clz32Sx4: vex_printf("Clz32Sx4"); return;
+ case Iop_Cls8Sx16: vex_printf("Cls8Sx16"); return;
+ case Iop_Cls16Sx8: vex_printf("Cls16Sx8"); return;
+ case Iop_Cls32Sx4: vex_printf("Cls32Sx4"); return;
+
+ case Iop_ShlV128: vex_printf("ShlV128"); return;
+ case Iop_ShrV128: vex_printf("ShrV128"); return;
+
+ case Iop_ShlN8x16: vex_printf("ShlN8x16"); return;
+ case Iop_ShlN16x8: vex_printf("ShlN16x8"); return;
+ case Iop_ShlN32x4: vex_printf("ShlN32x4"); return;
+ case Iop_ShlN64x2: vex_printf("ShlN64x2"); return;
+ case Iop_ShrN8x16: vex_printf("ShrN8x16"); return;
+ case Iop_ShrN16x8: vex_printf("ShrN16x8"); return;
+ case Iop_ShrN32x4: vex_printf("ShrN32x4"); return;
+ case Iop_ShrN64x2: vex_printf("ShrN64x2"); return;
+ case Iop_SarN8x16: vex_printf("SarN8x16"); return;
+ case Iop_SarN16x8: vex_printf("SarN16x8"); return;
+ case Iop_SarN32x4: vex_printf("SarN32x4"); return;
+ case Iop_SarN64x2: vex_printf("SarN64x2"); return;
+
+ case Iop_Shl8x16: vex_printf("Shl8x16"); return;
+ case Iop_Shl16x8: vex_printf("Shl16x8"); return;
+ case Iop_Shl32x4: vex_printf("Shl32x4"); return;
+ case Iop_Shl64x2: vex_printf("Shl64x2"); return;
+ case Iop_QSal8x16: vex_printf("QSal8x16"); return;
+ case Iop_QSal16x8: vex_printf("QSal16x8"); return;
+ case Iop_QSal32x4: vex_printf("QSal32x4"); return;
+ case Iop_QSal64x2: vex_printf("QSal64x2"); return;
+ case Iop_QShl8x16: vex_printf("QShl8x16"); return;
+ case Iop_QShl16x8: vex_printf("QShl16x8"); return;
+ case Iop_QShl32x4: vex_printf("QShl32x4"); return;
+ case Iop_QShl64x2: vex_printf("QShl64x2"); return;
+ case Iop_QSalN8x16: vex_printf("QSalN8x16"); return;
+ case Iop_QSalN16x8: vex_printf("QSalN16x8"); return;
+ case Iop_QSalN32x4: vex_printf("QSalN32x4"); return;
+ case Iop_QSalN64x2: vex_printf("QSalN64x2"); return;
+ case Iop_QShlN8x16: vex_printf("QShlN8x16"); return;
+ case Iop_QShlN16x8: vex_printf("QShlN16x8"); return;
+ case Iop_QShlN32x4: vex_printf("QShlN32x4"); return;
+ case Iop_QShlN64x2: vex_printf("QShlN64x2"); return;
+ case Iop_QShlN8Sx16: vex_printf("QShlN8Sx16"); return;
+ case Iop_QShlN16Sx8: vex_printf("QShlN16Sx8"); return;
+ case Iop_QShlN32Sx4: vex_printf("QShlN32Sx4"); return;
+ case Iop_QShlN64Sx2: vex_printf("QShlN64Sx2"); return;
+ case Iop_Shr8x16: vex_printf("Shr8x16"); return;
+ case Iop_Shr16x8: vex_printf("Shr16x8"); return;
+ case Iop_Shr32x4: vex_printf("Shr32x4"); return;
+ case Iop_Shr64x2: vex_printf("Shr64x2"); return;
+ case Iop_Sar8x16: vex_printf("Sar8x16"); return;
+ case Iop_Sar16x8: vex_printf("Sar16x8"); return;
+ case Iop_Sar32x4: vex_printf("Sar32x4"); return;
+ case Iop_Sar64x2: vex_printf("Sar64x2"); return;
+ case Iop_Sal8x16: vex_printf("Sal8x16"); return;
+ case Iop_Sal16x8: vex_printf("Sal16x8"); return;
+ case Iop_Sal32x4: vex_printf("Sal32x4"); return;
+ case Iop_Sal64x2: vex_printf("Sal64x2"); return;
+ case Iop_Rol8x16: vex_printf("Rol8x16"); return;
+ case Iop_Rol16x8: vex_printf("Rol16x8"); return;
+ case Iop_Rol32x4: vex_printf("Rol32x4"); return;
+
+ case Iop_Narrow16x8: vex_printf("Narrow16x8"); return;
+ case Iop_Narrow32x4: vex_printf("Narrow32x4"); return;
+ case Iop_QNarrow16Ux8: vex_printf("QNarrow16Ux8"); return;
+ case Iop_QNarrow32Ux4: vex_printf("QNarrow32Ux4"); return;
+ case Iop_QNarrow16Sx8: vex_printf("QNarrow16Sx8"); return;
+ case Iop_QNarrow32Sx4: vex_printf("QNarrow32Sx4"); return;
+ case Iop_Shorten16x8: vex_printf("Shorten16x8"); return;
+ case Iop_Shorten32x4: vex_printf("Shorten32x4"); return;
+ case Iop_Shorten64x2: vex_printf("Shorten64x2"); return;
+ case Iop_QShortenU16Ux8: vex_printf("QShortenU16Ux8"); return;
+ case Iop_QShortenU32Ux4: vex_printf("QShortenU32Ux4"); return;
+ case Iop_QShortenU64Ux2: vex_printf("QShortenU64Ux2"); return;
+ case Iop_QShortenS16Sx8: vex_printf("QShortenS16Sx8"); return;
+ case Iop_QShortenS32Sx4: vex_printf("QShortenS32Sx4"); return;
+ case Iop_QShortenS64Sx2: vex_printf("QShortenS64Sx2"); return;
+ case Iop_QShortenU16Sx8: vex_printf("QShortenU16Sx8"); return;
+ case Iop_QShortenU32Sx4: vex_printf("QShortenU32Sx4"); return;
+ case Iop_QShortenU64Sx2: vex_printf("QShortenU64Sx2"); return;
+ case Iop_Longen8Ux8: vex_printf("Longen8Ux8"); return;
+ case Iop_Longen16Ux4: vex_printf("Longen16Ux4"); return;
+ case Iop_Longen32Ux2: vex_printf("Longen32Ux2"); return;
+ case Iop_Longen8Sx8: vex_printf("Longen8Sx8"); return;
+ case Iop_Longen16Sx4: vex_printf("Longen16Sx4"); return;
+ case Iop_Longen32Sx2: vex_printf("Longen32Sx2"); return;
+
+ case Iop_InterleaveHI8x16: vex_printf("InterleaveHI8x16"); return;
+ case Iop_InterleaveHI16x8: vex_printf("InterleaveHI16x8"); return;
+ case Iop_InterleaveHI32x4: vex_printf("InterleaveHI32x4"); return;
+ case Iop_InterleaveHI64x2: vex_printf("InterleaveHI64x2"); return;
+ case Iop_InterleaveLO8x16: vex_printf("InterleaveLO8x16"); return;
+ case Iop_InterleaveLO16x8: vex_printf("InterleaveLO16x8"); return;
+ case Iop_InterleaveLO32x4: vex_printf("InterleaveLO32x4"); return;
+ case Iop_InterleaveLO64x2: vex_printf("InterleaveLO64x2"); return;
+
+ case Iop_CatOddLanes8x16: vex_printf("CatOddLanes8x16"); return;
+ case Iop_CatOddLanes16x8: vex_printf("CatOddLanes16x8"); return;
+ case Iop_CatOddLanes32x4: vex_printf("CatOddLanes32x4"); return;
+ case Iop_CatEvenLanes8x16: vex_printf("CatEvenLanes8x16"); return;
+ case Iop_CatEvenLanes16x8: vex_printf("CatEvenLanes16x8"); return;
+ case Iop_CatEvenLanes32x4: vex_printf("CatEvenLanes32x4"); return;
+
+ case Iop_InterleaveOddLanes8x16: vex_printf("InterleaveOddLanes8x16"); return;
+ case Iop_InterleaveOddLanes16x8: vex_printf("InterleaveOddLanes16x8"); return;
+ case Iop_InterleaveOddLanes32x4: vex_printf("InterleaveOddLanes32x4"); return;
+ case Iop_InterleaveEvenLanes8x16: vex_printf("InterleaveEvenLanes8x16"); return;
+ case Iop_InterleaveEvenLanes16x8: vex_printf("InterleaveEvenLanes16x8"); return;
+ case Iop_InterleaveEvenLanes32x4: vex_printf("InterleaveEvenLanes32x4"); return;
+
+ case Iop_GetElem8x16: vex_printf("GetElem8x16"); return;
+ case Iop_GetElem16x8: vex_printf("GetElem16x8"); return;
+ case Iop_GetElem32x4: vex_printf("GetElem32x4"); return;
+ case Iop_GetElem64x2: vex_printf("GetElem64x2"); return;
+
+ case Iop_GetElem8x8: vex_printf("GetElem8x8"); return;
+ case Iop_GetElem16x4: vex_printf("GetElem16x4"); return;
+ case Iop_GetElem32x2: vex_printf("GetElem32x2"); return;
+ case Iop_SetElem8x8: vex_printf("SetElem8x8"); return;
+ case Iop_SetElem16x4: vex_printf("SetElem16x4"); return;
+ case Iop_SetElem32x2: vex_printf("SetElem32x2"); return;
+
+ case Iop_Extract64: vex_printf("Extract64"); return;
+ case Iop_ExtractV128: vex_printf("ExtractV128"); return;
+
+ case Iop_Perm8x16: vex_printf("Perm8x16"); return;
+ case Iop_Reverse16_8x16: vex_printf("Reverse16_8x16"); return;
+ case Iop_Reverse32_8x16: vex_printf("Reverse32_8x16"); return;
+ case Iop_Reverse32_16x8: vex_printf("Reverse32_16x8"); return;
+ case Iop_Reverse64_8x16: vex_printf("Reverse64_8x16"); return;
+ case Iop_Reverse64_16x8: vex_printf("Reverse64_16x8"); return;
+ case Iop_Reverse64_32x4: vex_printf("Reverse64_32x4"); return;
+
+ case Iop_F32ToFixed32Ux4_RZ: vex_printf("F32ToFixed32Ux4_RZ"); return;
+ case Iop_F32ToFixed32Sx4_RZ: vex_printf("F32ToFixed32Sx4_RZ"); return;
+ case Iop_Fixed32UToF32x4_RN: vex_printf("Fixed32UToF32x4_RN"); return;
+ case Iop_Fixed32SToF32x4_RN: vex_printf("Fixed32SToF32x4_RN"); return;
+ case Iop_F32ToFixed32Ux2_RZ: vex_printf("F32ToFixed32Ux2_RZ"); return;
+ case Iop_F32ToFixed32Sx2_RZ: vex_printf("F32ToFixed32Sx2_RZ"); return;
+ case Iop_Fixed32UToF32x2_RN: vex_printf("Fixed32UToF32x2_RN"); return;
+ case Iop_Fixed32SToF32x2_RN: vex_printf("Fixed32SToF32x2_RN"); return;
+
+ default: vpanic("ppIROp(1)");
+ }
+
+ vassert(str);
+ switch (op - base) {
+ case 0: vex_printf("%s",str); vex_printf("8"); break;
+ case 1: vex_printf("%s",str); vex_printf("16"); break;
+ case 2: vex_printf("%s",str); vex_printf("32"); break;
+ case 3: vex_printf("%s",str); vex_printf("64"); break;
+ default: vpanic("ppIROp(2)");
+ }
+}
+
+void ppIRExpr ( IRExpr* e )
+{
+ Int i;
+ switch (e->tag) {
+ case Iex_Binder:
+ vex_printf("BIND-%d", e->Iex.Binder.binder);
+ break;
+ case Iex_Get:
+ vex_printf( "GET:" );
+ ppIRType(e->Iex.Get.ty);
+ vex_printf("(%d)", e->Iex.Get.offset);
+ break;
+ case Iex_GetI:
+ vex_printf( "GETI" );
+ ppIRRegArray(e->Iex.GetI.descr);
+ vex_printf("[");
+ ppIRExpr(e->Iex.GetI.ix);
+ vex_printf(",%d]", e->Iex.GetI.bias);
+ break;
+ case Iex_RdTmp:
+ ppIRTemp(e->Iex.RdTmp.tmp);
+ break;
+ case Iex_Qop:
+ ppIROp(e->Iex.Qop.op);
+ vex_printf( "(" );
+ ppIRExpr(e->Iex.Qop.arg1);
+ vex_printf( "," );
+ ppIRExpr(e->Iex.Qop.arg2);
+ vex_printf( "," );
+ ppIRExpr(e->Iex.Qop.arg3);
+ vex_printf( "," );
+ ppIRExpr(e->Iex.Qop.arg4);
+ vex_printf( ")" );
+ break;
+ case Iex_Triop:
+ ppIROp(e->Iex.Triop.op);
+ vex_printf( "(" );
+ ppIRExpr(e->Iex.Triop.arg1);
+ vex_printf( "," );
+ ppIRExpr(e->Iex.Triop.arg2);
+ vex_printf( "," );
+ ppIRExpr(e->Iex.Triop.arg3);
+ vex_printf( ")" );
+ break;
+ case Iex_Binop:
+ ppIROp(e->Iex.Binop.op);
+ vex_printf( "(" );
+ ppIRExpr(e->Iex.Binop.arg1);
+ vex_printf( "," );
+ ppIRExpr(e->Iex.Binop.arg2);
+ vex_printf( ")" );
+ break;
+ case Iex_Unop:
+ ppIROp(e->Iex.Unop.op);
+ vex_printf( "(" );
+ ppIRExpr(e->Iex.Unop.arg);
+ vex_printf( ")" );
+ break;
+ case Iex_Load:
+ vex_printf( "LD%s:", e->Iex.Load.end==Iend_LE ? "le" : "be" );
+ ppIRType(e->Iex.Load.ty);
+ vex_printf( "(" );
+ ppIRExpr(e->Iex.Load.addr);
+ vex_printf( ")" );
+ break;
+ case Iex_Const:
+ ppIRConst(e->Iex.Const.con);
+ break;
+ case Iex_CCall:
+ ppIRCallee(e->Iex.CCall.cee);
+ vex_printf("(");
+ for (i = 0; e->Iex.CCall.args[i] != NULL; i++) {
+ ppIRExpr(e->Iex.CCall.args[i]);
+ if (e->Iex.CCall.args[i+1] != NULL)
+ vex_printf(",");
+ }
+ vex_printf("):");
+ ppIRType(e->Iex.CCall.retty);
+ break;
+ case Iex_Mux0X:
+ vex_printf("Mux0X(");
+ ppIRExpr(e->Iex.Mux0X.cond);
+ vex_printf(",");
+ ppIRExpr(e->Iex.Mux0X.expr0);
+ vex_printf(",");
+ ppIRExpr(e->Iex.Mux0X.exprX);
+ vex_printf(")");
+ break;
+ default:
+ vpanic("ppIRExpr");
+ }
+}
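+
+/* Illustrative example of the concrete syntax: an expression built as
+      IRExpr_Binop(Iop_Add32, IRExpr_RdTmp(3),
+                   IRExpr_Const(IRConst_U32(4)))
+   (see the constructors below) is rendered by ppIRExpr as, roughly,
+   "Add32(t3,0x4:I32)". */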
+
+void ppIREffect ( IREffect fx )
+{
+ switch (fx) {
+ case Ifx_None: vex_printf("noFX"); return;
+ case Ifx_Read: vex_printf("RdFX"); return;
+ case Ifx_Write: vex_printf("WrFX"); return;
+ case Ifx_Modify: vex_printf("MoFX"); return;
+ default: vpanic("ppIREffect");
+ }
+}
+
+void ppIRDirty ( IRDirty* d )
+{
+ Int i;
+ if (d->tmp != IRTemp_INVALID) {
+ ppIRTemp(d->tmp);
+ vex_printf(" = ");
+ }
+ vex_printf("DIRTY ");
+ ppIRExpr(d->guard);
+ if (d->needsBBP)
+ vex_printf(" NeedsBBP");
+ if (d->mFx != Ifx_None) {
+ vex_printf(" ");
+ ppIREffect(d->mFx);
+ vex_printf("-mem(");
+ ppIRExpr(d->mAddr);
+ vex_printf(",%d)", d->mSize);
+ }
+ for (i = 0; i < d->nFxState; i++) {
+ vex_printf(" ");
+ ppIREffect(d->fxState[i].fx);
+ vex_printf("-gst(%d,%d)", d->fxState[i].offset, d->fxState[i].size);
+ }
+ vex_printf(" ::: ");
+ ppIRCallee(d->cee);
+ vex_printf("(");
+ for (i = 0; d->args[i] != NULL; i++) {
+ ppIRExpr(d->args[i]);
+ if (d->args[i+1] != NULL) {
+ vex_printf(",");
+ }
+ }
+ vex_printf(")");
+}
+
+void ppIRCAS ( IRCAS* cas )
+{
+ /* Print even structurally invalid constructions, as an aid to
+ debugging. */
+ if (cas->oldHi != IRTemp_INVALID) {
+ ppIRTemp(cas->oldHi);
+ vex_printf(",");
+ }
+ ppIRTemp(cas->oldLo);
+ vex_printf(" = CAS%s(", cas->end==Iend_LE ? "le" : "be" );
+ ppIRExpr(cas->addr);
+ vex_printf("::");
+ if (cas->expdHi) {
+ ppIRExpr(cas->expdHi);
+ vex_printf(",");
+ }
+ ppIRExpr(cas->expdLo);
+ vex_printf("->");
+ if (cas->dataHi) {
+ ppIRExpr(cas->dataHi);
+ vex_printf(",");
+ }
+ ppIRExpr(cas->dataLo);
+ vex_printf(")");
+}
+
+void ppIRJumpKind ( IRJumpKind kind )
+{
+ switch (kind) {
+ case Ijk_Boring: vex_printf("Boring"); break;
+ case Ijk_Call: vex_printf("Call"); break;
+ case Ijk_Ret: vex_printf("Return"); break;
+ case Ijk_ClientReq: vex_printf("ClientReq"); break;
+ case Ijk_Yield: vex_printf("Yield"); break;
+ case Ijk_EmWarn: vex_printf("EmWarn"); break;
+ case Ijk_EmFail: vex_printf("EmFail"); break;
+ case Ijk_NoDecode: vex_printf("NoDecode"); break;
+ case Ijk_MapFail: vex_printf("MapFail"); break;
+ case Ijk_TInval: vex_printf("Invalidate"); break;
+ case Ijk_NoRedir: vex_printf("NoRedir"); break;
+ case Ijk_SigTRAP: vex_printf("SigTRAP"); break;
+ case Ijk_SigSEGV: vex_printf("SigSEGV"); break;
+ case Ijk_SigBUS: vex_printf("SigBUS"); break;
+ case Ijk_Sys_syscall: vex_printf("Sys_syscall"); break;
+ case Ijk_Sys_int32: vex_printf("Sys_int32"); break;
+ case Ijk_Sys_int128: vex_printf("Sys_int128"); break;
+ case Ijk_Sys_int129: vex_printf("Sys_int129"); break;
+ case Ijk_Sys_int130: vex_printf("Sys_int130"); break;
+ case Ijk_Sys_sysenter: vex_printf("Sys_sysenter"); break;
+ default: vpanic("ppIRJumpKind");
+ }
+}
+
+void ppIRMBusEvent ( IRMBusEvent event )
+{
+ switch (event) {
+ case Imbe_Fence: vex_printf("Fence"); break;
+ default: vpanic("ppIRMBusEvent");
+ }
+}
+
+void ppIRStmt ( IRStmt* s )
+{
+ if (!s) {
+ vex_printf("!!! IRStmt* which is NULL !!!");
+ return;
+ }
+ switch (s->tag) {
+ case Ist_NoOp:
+ vex_printf("IR-NoOp");
+ break;
+ case Ist_IMark:
+ vex_printf( "------ IMark(0x%llx, %d) ------",
+ s->Ist.IMark.addr, s->Ist.IMark.len);
+ break;
+ case Ist_AbiHint:
+ vex_printf("====== AbiHint(");
+ ppIRExpr(s->Ist.AbiHint.base);
+ vex_printf(", %d, ", s->Ist.AbiHint.len);
+ ppIRExpr(s->Ist.AbiHint.nia);
+ vex_printf(") ======");
+ break;
+ case Ist_Put:
+ vex_printf( "PUT(%d) = ", s->Ist.Put.offset);
+ ppIRExpr(s->Ist.Put.data);
+ break;
+ case Ist_PutI:
+ vex_printf( "PUTI" );
+ ppIRRegArray(s->Ist.PutI.descr);
+ vex_printf("[");
+ ppIRExpr(s->Ist.PutI.ix);
+ vex_printf(",%d] = ", s->Ist.PutI.bias);
+ ppIRExpr(s->Ist.PutI.data);
+ break;
+ case Ist_WrTmp:
+ ppIRTemp(s->Ist.WrTmp.tmp);
+ vex_printf( " = " );
+ ppIRExpr(s->Ist.WrTmp.data);
+ break;
+ case Ist_Store:
+ vex_printf( "ST%s(", s->Ist.Store.end==Iend_LE ? "le" : "be" );
+ ppIRExpr(s->Ist.Store.addr);
+ vex_printf( ") = ");
+ ppIRExpr(s->Ist.Store.data);
+ break;
+ case Ist_CAS:
+ ppIRCAS(s->Ist.CAS.details);
+ break;
+ case Ist_LLSC:
+ if (s->Ist.LLSC.storedata == NULL) {
+ ppIRTemp(s->Ist.LLSC.result);
+ vex_printf(" = LD%s-Linked(",
+ s->Ist.LLSC.end==Iend_LE ? "le" : "be");
+ ppIRExpr(s->Ist.LLSC.addr);
+ vex_printf(")");
+ } else {
+ ppIRTemp(s->Ist.LLSC.result);
+ vex_printf(" = ( ST%s-Cond(",
+ s->Ist.LLSC.end==Iend_LE ? "le" : "be");
+ ppIRExpr(s->Ist.LLSC.addr);
+ vex_printf(") = ");
+ ppIRExpr(s->Ist.LLSC.storedata);
+ vex_printf(" )");
+ }
+ break;
+ case Ist_Dirty:
+ ppIRDirty(s->Ist.Dirty.details);
+ break;
+ case Ist_MBE:
+ vex_printf("IR-");
+ ppIRMBusEvent(s->Ist.MBE.event);
+ break;
+ case Ist_Exit:
+ vex_printf( "if (" );
+ ppIRExpr(s->Ist.Exit.guard);
+ vex_printf( ") goto {");
+ ppIRJumpKind(s->Ist.Exit.jk);
+ vex_printf("} ");
+ ppIRConst(s->Ist.Exit.dst);
+ break;
+ default:
+ vpanic("ppIRStmt");
+ }
+}
+
+void ppIRTypeEnv ( IRTypeEnv* env ) {
+ UInt i;
+ for (i = 0; i < env->types_used; i++) {
+ if (i % 8 == 0)
+ vex_printf( " ");
+ ppIRTemp(i);
+ vex_printf( ":");
+ ppIRType(env->types[i]);
+ if (i % 8 == 7)
+ vex_printf( "\n");
+ else
+ vex_printf( " ");
+ }
+   /* Emit a trailing newline unless the loop body just printed one
+      (which happens when the last index satisfied i % 8 == 7). */
+   if (env->types_used > 0 && (env->types_used - 1) % 8 != 7)
+ vex_printf( "\n");
+}
+
+void ppIRSB ( IRSB* bb )
+{
+ Int i;
+ vex_printf("IRSB {\n");
+ ppIRTypeEnv(bb->tyenv);
+ vex_printf("\n");
+ for (i = 0; i < bb->stmts_used; i++) {
+ vex_printf( " ");
+ ppIRStmt(bb->stmts[i]);
+ vex_printf( "\n");
+ }
+ vex_printf( " goto {");
+ ppIRJumpKind(bb->jumpkind);
+ vex_printf( "} ");
+ ppIRExpr( bb->next );
+ vex_printf( "\n}\n");
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Constructors ---*/
+/*---------------------------------------------------------------*/
+
+
+/* Constructors -- IRConst */
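+
+/* Note: all the constructors in this file obtain storage with
+   LibVEX_Alloc; the resulting nodes are never freed individually,
+   but are reclaimed wholesale when VEX's allocation area is reset. */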
+
+IRConst* IRConst_U1 ( Bool bit )
+{
+ IRConst* c = LibVEX_Alloc(sizeof(IRConst));
+ c->tag = Ico_U1;
+ c->Ico.U1 = bit;
+ /* call me paranoid; I don't care :-) */
+ vassert(bit == False || bit == True);
+ return c;
+}
+IRConst* IRConst_U8 ( UChar u8 )
+{
+ IRConst* c = LibVEX_Alloc(sizeof(IRConst));
+ c->tag = Ico_U8;
+ c->Ico.U8 = u8;
+ return c;
+}
+IRConst* IRConst_U16 ( UShort u16 )
+{
+ IRConst* c = LibVEX_Alloc(sizeof(IRConst));
+ c->tag = Ico_U16;
+ c->Ico.U16 = u16;
+ return c;
+}
+IRConst* IRConst_U32 ( UInt u32 )
+{
+ IRConst* c = LibVEX_Alloc(sizeof(IRConst));
+ c->tag = Ico_U32;
+ c->Ico.U32 = u32;
+ return c;
+}
+IRConst* IRConst_U64 ( ULong u64 )
+{
+ IRConst* c = LibVEX_Alloc(sizeof(IRConst));
+ c->tag = Ico_U64;
+ c->Ico.U64 = u64;
+ return c;
+}
+IRConst* IRConst_F64 ( Double f64 )
+{
+ IRConst* c = LibVEX_Alloc(sizeof(IRConst));
+ c->tag = Ico_F64;
+ c->Ico.F64 = f64;
+ return c;
+}
+IRConst* IRConst_F64i ( ULong f64i )
+{
+ IRConst* c = LibVEX_Alloc(sizeof(IRConst));
+ c->tag = Ico_F64i;
+ c->Ico.F64i = f64i;
+ return c;
+}
+IRConst* IRConst_V128 ( UShort con )
+{
+ IRConst* c = LibVEX_Alloc(sizeof(IRConst));
+ c->tag = Ico_V128;
+ c->Ico.V128 = con;
+ return c;
+}
+
+/* Constructors -- IRCallee */
+
+IRCallee* mkIRCallee ( Int regparms, HChar* name, void* addr )
+{
+ IRCallee* ce = LibVEX_Alloc(sizeof(IRCallee));
+ ce->regparms = regparms;
+ ce->name = name;
+ ce->addr = addr;
+ ce->mcx_mask = 0;
+ vassert(regparms >= 0 && regparms <= 3);
+ vassert(name != NULL);
+ vassert(addr != 0);
+ return ce;
+}
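+
+/* The 0..3 limit on regparms reflects the register-passing
+   conventions the back ends implement; mcx_mask starts out as zero
+   and is set afterwards by clients (Memcheck, hence the name) that
+   need it. */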
+
+
+/* Constructors -- IRRegArray */
+
+IRRegArray* mkIRRegArray ( Int base, IRType elemTy, Int nElems )
+{
+ IRRegArray* arr = LibVEX_Alloc(sizeof(IRRegArray));
+ arr->base = base;
+ arr->elemTy = elemTy;
+ arr->nElems = nElems;
+ vassert(!(arr->base < 0 || arr->base > 10000 /* somewhat arbitrary */));
+ vassert(!(arr->elemTy == Ity_I1));
+ vassert(!(arr->nElems <= 0 || arr->nElems > 500 /* somewhat arbitrary */));
+ return arr;
+}
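+
+/* An IRRegArray describes an indexable slice of the guest state --
+   the rotating x87 register stack is the classic case -- which is
+   why base and nElems get the (admittedly arbitrary) sanity bounds
+   asserted above. */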
+
+
+/* Constructors -- IRExpr */
+
+IRExpr* IRExpr_Binder ( Int binder ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_Binder;
+ e->Iex.Binder.binder = binder;
+ return e;
+}
+IRExpr* IRExpr_Get ( Int off, IRType ty ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_Get;
+ e->Iex.Get.offset = off;
+ e->Iex.Get.ty = ty;
+ return e;
+}
+IRExpr* IRExpr_GetI ( IRRegArray* descr, IRExpr* ix, Int bias ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_GetI;
+ e->Iex.GetI.descr = descr;
+ e->Iex.GetI.ix = ix;
+ e->Iex.GetI.bias = bias;
+ return e;
+}
+IRExpr* IRExpr_RdTmp ( IRTemp tmp ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_RdTmp;
+ e->Iex.RdTmp.tmp = tmp;
+ return e;
+}
+IRExpr* IRExpr_Qop ( IROp op, IRExpr* arg1, IRExpr* arg2,
+ IRExpr* arg3, IRExpr* arg4 ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_Qop;
+ e->Iex.Qop.op = op;
+ e->Iex.Qop.arg1 = arg1;
+ e->Iex.Qop.arg2 = arg2;
+ e->Iex.Qop.arg3 = arg3;
+ e->Iex.Qop.arg4 = arg4;
+ return e;
+}
+IRExpr* IRExpr_Triop ( IROp op, IRExpr* arg1,
+ IRExpr* arg2, IRExpr* arg3 ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_Triop;
+ e->Iex.Triop.op = op;
+ e->Iex.Triop.arg1 = arg1;
+ e->Iex.Triop.arg2 = arg2;
+ e->Iex.Triop.arg3 = arg3;
+ return e;
+}
+IRExpr* IRExpr_Binop ( IROp op, IRExpr* arg1, IRExpr* arg2 ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_Binop;
+ e->Iex.Binop.op = op;
+ e->Iex.Binop.arg1 = arg1;
+ e->Iex.Binop.arg2 = arg2;
+ return e;
+}
+IRExpr* IRExpr_Unop ( IROp op, IRExpr* arg ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_Unop;
+ e->Iex.Unop.op = op;
+ e->Iex.Unop.arg = arg;
+ return e;
+}
+IRExpr* IRExpr_Load ( IREndness end, IRType ty, IRExpr* addr ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_Load;
+ e->Iex.Load.end = end;
+ e->Iex.Load.ty = ty;
+ e->Iex.Load.addr = addr;
+ vassert(end == Iend_LE || end == Iend_BE);
+ return e;
+}
+IRExpr* IRExpr_Const ( IRConst* con ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_Const;
+ e->Iex.Const.con = con;
+ return e;
+}
+IRExpr* IRExpr_CCall ( IRCallee* cee, IRType retty, IRExpr** args ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_CCall;
+ e->Iex.CCall.cee = cee;
+ e->Iex.CCall.retty = retty;
+ e->Iex.CCall.args = args;
+ return e;
+}
+IRExpr* IRExpr_Mux0X ( IRExpr* cond, IRExpr* expr0, IRExpr* exprX ) {
+ IRExpr* e = LibVEX_Alloc(sizeof(IRExpr));
+ e->tag = Iex_Mux0X;
+ e->Iex.Mux0X.cond = cond;
+ e->Iex.Mux0X.expr0 = expr0;
+ e->Iex.Mux0X.exprX = exprX;
+ return e;
+}
+
+
+/* Constructors for NULL-terminated IRExpr expression vectors,
+ suitable for use as arg lists in clean/dirty helper calls. */
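+
+/* For example, the argument list for a two-argument helper call could
+   be built as
+      IRExpr** args = mkIRExprVec_2(IRExpr_RdTmp(t1), IRExpr_RdTmp(t2));
+   with t1/t2 being whatever temps hold the arguments.  The trailing
+   NULL is what the argument loops in ppIRExpr and ppIRDirty use to
+   find the end of the vector. */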
+
+IRExpr** mkIRExprVec_0 ( void ) {
+ IRExpr** vec = LibVEX_Alloc(1 * sizeof(IRExpr*));
+ vec[0] = NULL;
+ return vec;
+}
+IRExpr** mkIRExprVec_1 ( IRExpr* arg1 ) {
+ IRExpr** vec = LibVEX_Alloc(2 * sizeof(IRExpr*));
+ vec[0] = arg1;
+ vec[1] = NULL;
+ return vec;
+}
+IRExpr** mkIRExprVec_2 ( IRExpr* arg1, IRExpr* arg2 ) {
+ IRExpr** vec = LibVEX_Alloc(3 * sizeof(IRExpr*));
+ vec[0] = arg1;
+ vec[1] = arg2;
+ vec[2] = NULL;
+ return vec;
+}
+IRExpr** mkIRExprVec_3 ( IRExpr* arg1, IRExpr* arg2, IRExpr* arg3 ) {
+ IRExpr** vec = LibVEX_Alloc(4 * sizeof(IRExpr*));
+ vec[0] = arg1;
+ vec[1] = arg2;
+ vec[2] = arg3;
+ vec[3] = NULL;
+ return vec;
+}
+IRExpr** mkIRExprVec_4 ( IRExpr* arg1, IRExpr* arg2, IRExpr* arg3,
+ IRExpr* arg4 ) {
+ IRExpr** vec = LibVEX_Alloc(5 * sizeof(IRExpr*));
+ vec[0] = arg1;
+ vec[1] = arg2;
+ vec[2] = arg3;
+ vec[3] = arg4;
+ vec[4] = NULL;
+ return vec;
+}
+IRExpr** mkIRExprVec_5 ( IRExpr* arg1, IRExpr* arg2, IRExpr* arg3,
+ IRExpr* arg4, IRExpr* arg5 ) {
+ IRExpr** vec = LibVEX_Alloc(6 * sizeof(IRExpr*));
+ vec[0] = arg1;
+ vec[1] = arg2;
+ vec[2] = arg3;
+ vec[3] = arg4;
+ vec[4] = arg5;
+ vec[5] = NULL;
+ return vec;
+}
+IRExpr** mkIRExprVec_6 ( IRExpr* arg1, IRExpr* arg2, IRExpr* arg3,
+ IRExpr* arg4, IRExpr* arg5, IRExpr* arg6 ) {
+ IRExpr** vec = LibVEX_Alloc(7 * sizeof(IRExpr*));
+ vec[0] = arg1;
+ vec[1] = arg2;
+ vec[2] = arg3;
+ vec[3] = arg4;
+ vec[4] = arg5;
+ vec[5] = arg6;
+ vec[6] = NULL;
+ return vec;
+}
+IRExpr** mkIRExprVec_7 ( IRExpr* arg1, IRExpr* arg2, IRExpr* arg3,
+ IRExpr* arg4, IRExpr* arg5, IRExpr* arg6,
+ IRExpr* arg7 ) {
+ IRExpr** vec = LibVEX_Alloc(8 * sizeof(IRExpr*));
+ vec[0] = arg1;
+ vec[1] = arg2;
+ vec[2] = arg3;
+ vec[3] = arg4;
+ vec[4] = arg5;
+ vec[5] = arg6;
+ vec[6] = arg7;
+ vec[7] = NULL;
+ return vec;
+}
+IRExpr** mkIRExprVec_8 ( IRExpr* arg1, IRExpr* arg2, IRExpr* arg3,
+ IRExpr* arg4, IRExpr* arg5, IRExpr* arg6,
+ IRExpr* arg7, IRExpr* arg8 ) {
+ IRExpr** vec = LibVEX_Alloc(9 * sizeof(IRExpr*));
+ vec[0] = arg1;
+ vec[1] = arg2;
+ vec[2] = arg3;
+ vec[3] = arg4;
+ vec[4] = arg5;
+ vec[5] = arg6;
+ vec[6] = arg7;
+ vec[7] = arg8;
+ vec[8] = NULL;
+ return vec;
+}
+
+
+/* Constructors -- IRDirty */
+
+IRDirty* emptyIRDirty ( void ) {
+ IRDirty* d = LibVEX_Alloc(sizeof(IRDirty));
+ d->cee = NULL;
+ d->guard = NULL;
+ d->args = NULL;
+ d->tmp = IRTemp_INVALID;
+ d->mFx = Ifx_None;
+ d->mAddr = NULL;
+ d->mSize = 0;
+ d->needsBBP = False;
+ d->nFxState = 0;
+ return d;
+}
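+
+/* emptyIRDirty deliberately leaves cee, guard and args NULL; callers
+   must fill those in before the IRDirty is wrapped in an
+   IRStmt_Dirty, and deepCopyIRDirty below assumes they are
+   non-NULL. */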
+
+
+/* Constructors -- IRCAS */
+
+IRCAS* mkIRCAS ( IRTemp oldHi, IRTemp oldLo,
+ IREndness end, IRExpr* addr,
+ IRExpr* expdHi, IRExpr* expdLo,
+ IRExpr* dataHi, IRExpr* dataLo ) {
+ IRCAS* cas = LibVEX_Alloc(sizeof(IRCAS));
+ cas->oldHi = oldHi;
+ cas->oldLo = oldLo;
+ cas->end = end;
+ cas->addr = addr;
+ cas->expdHi = expdHi;
+ cas->expdLo = expdLo;
+ cas->dataHi = dataHi;
+ cas->dataLo = dataLo;
+ return cas;
+}
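+
+/* Convention: a single-element CAS passes oldHi == IRTemp_INVALID and
+   NULL for expdHi/dataHi; a double CAS supplies all eight fields.
+   ppIRCAS above and deepCopyIRCAS below both key off exactly these
+   IRTemp_INVALID/NULL tests. */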
+
+
+/* Constructors -- IRStmt */
+
+IRStmt* IRStmt_NoOp ( void )
+{
+ /* Just use a single static closure. */
+ static IRStmt static_closure;
+ static_closure.tag = Ist_NoOp;
+ return &static_closure;
+}
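+/* Since every no-op statement aliases that one static instance, the
+   returned IRStmt must never be modified in place. */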
+IRStmt* IRStmt_IMark ( Addr64 addr, Int len ) {
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_IMark;
+ s->Ist.IMark.addr = addr;
+ s->Ist.IMark.len = len;
+ return s;
+}
+IRStmt* IRStmt_AbiHint ( IRExpr* base, Int len, IRExpr* nia ) {
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_AbiHint;
+ s->Ist.AbiHint.base = base;
+ s->Ist.AbiHint.len = len;
+ s->Ist.AbiHint.nia = nia;
+ return s;
+}
+IRStmt* IRStmt_Put ( Int off, IRExpr* data ) {
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_Put;
+ s->Ist.Put.offset = off;
+ s->Ist.Put.data = data;
+ return s;
+}
+IRStmt* IRStmt_PutI ( IRRegArray* descr, IRExpr* ix,
+ Int bias, IRExpr* data ) {
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_PutI;
+ s->Ist.PutI.descr = descr;
+ s->Ist.PutI.ix = ix;
+ s->Ist.PutI.bias = bias;
+ s->Ist.PutI.data = data;
+ return s;
+}
+IRStmt* IRStmt_WrTmp ( IRTemp tmp, IRExpr* data ) {
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_WrTmp;
+ s->Ist.WrTmp.tmp = tmp;
+ s->Ist.WrTmp.data = data;
+ return s;
+}
+IRStmt* IRStmt_Store ( IREndness end, IRExpr* addr, IRExpr* data ) {
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_Store;
+ s->Ist.Store.end = end;
+ s->Ist.Store.addr = addr;
+ s->Ist.Store.data = data;
+ vassert(end == Iend_LE || end == Iend_BE);
+ return s;
+}
+IRStmt* IRStmt_CAS ( IRCAS* cas ) {
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_CAS;
+ s->Ist.CAS.details = cas;
+ return s;
+}
+IRStmt* IRStmt_LLSC ( IREndness end,
+ IRTemp result, IRExpr* addr, IRExpr* storedata ) {
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_LLSC;
+ s->Ist.LLSC.end = end;
+ s->Ist.LLSC.result = result;
+ s->Ist.LLSC.addr = addr;
+ s->Ist.LLSC.storedata = storedata;
+ return s;
+}
+IRStmt* IRStmt_Dirty ( IRDirty* d )
+{
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_Dirty;
+ s->Ist.Dirty.details = d;
+ return s;
+}
+IRStmt* IRStmt_MBE ( IRMBusEvent event )
+{
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_MBE;
+ s->Ist.MBE.event = event;
+ return s;
+}
+IRStmt* IRStmt_Exit ( IRExpr* guard, IRJumpKind jk, IRConst* dst ) {
+ IRStmt* s = LibVEX_Alloc(sizeof(IRStmt));
+ s->tag = Ist_Exit;
+ s->Ist.Exit.guard = guard;
+ s->Ist.Exit.jk = jk;
+ s->Ist.Exit.dst = dst;
+ return s;
+}
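+
+/* Illustrative example: the statement "t1 = Not32(t0)" could be built
+   from these constructors as
+      IRStmt* st = IRStmt_WrTmp(1, IRExpr_Unop(Iop_Not32,
+                                               IRExpr_RdTmp(0)));
+   and then attached to a block with addStmtToIRSB (defined later in
+   this file). */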
+
+
+/* Constructors -- IRTypeEnv */
+
+IRTypeEnv* emptyIRTypeEnv ( void )
+{
+ IRTypeEnv* env = LibVEX_Alloc(sizeof(IRTypeEnv));
+ env->types = LibVEX_Alloc(8 * sizeof(IRType));
+ env->types_size = 8;
+ env->types_used = 0;
+ return env;
+}
+
+
+/* Constructors -- IRSB */
+
+IRSB* emptyIRSB ( void )
+{
+ IRSB* bb = LibVEX_Alloc(sizeof(IRSB));
+ bb->tyenv = emptyIRTypeEnv();
+ bb->stmts_used = 0;
+ bb->stmts_size = 8;
+ bb->stmts = LibVEX_Alloc(bb->stmts_size * sizeof(IRStmt*));
+ bb->next = NULL;
+ bb->jumpkind = Ijk_Boring;
+ return bb;
+}
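+
+/* The block returned by emptyIRSB is not yet complete IR: in
+   particular next is NULL, and must be set to the fall-through guest
+   address expression before the block can be used. */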
+
+
+/*---------------------------------------------------------------*/
+/*--- (Deep) copy constructors.  These make complete copies   ---*/
+/*--- of the original; the copy can be modified without       ---*/
+/*--- affecting the original.                                 ---*/
+/*---------------------------------------------------------------*/
+
+/* Copying IR Expr vectors (for call args). */
+
+/* Shallow copy of an IRExpr vector */
+
+IRExpr** shallowCopyIRExprVec ( IRExpr** vec )
+{
+ Int i;
+ IRExpr** newvec;
+ for (i = 0; vec[i]; i++)
+ ;
+ newvec = LibVEX_Alloc((i+1)*sizeof(IRExpr*));
+ for (i = 0; vec[i]; i++)
+ newvec[i] = vec[i];
+ newvec[i] = NULL;
+ return newvec;
+}
+
+/* Deep copy of an IRExpr vector */
+
+IRExpr** deepCopyIRExprVec ( IRExpr** vec )
+{
+ Int i;
+ IRExpr** newvec = shallowCopyIRExprVec( vec );
+ for (i = 0; newvec[i]; i++)
+ newvec[i] = deepCopyIRExpr(newvec[i]);
+ return newvec;
+}
+
+/* Deep copy constructors for all heap-allocated IR types follow. */
+
+IRConst* deepCopyIRConst ( IRConst* c )
+{
+ switch (c->tag) {
+ case Ico_U1: return IRConst_U1(c->Ico.U1);
+ case Ico_U8: return IRConst_U8(c->Ico.U8);
+ case Ico_U16: return IRConst_U16(c->Ico.U16);
+ case Ico_U32: return IRConst_U32(c->Ico.U32);
+ case Ico_U64: return IRConst_U64(c->Ico.U64);
+ case Ico_F64: return IRConst_F64(c->Ico.F64);
+ case Ico_F64i: return IRConst_F64i(c->Ico.F64i);
+ case Ico_V128: return IRConst_V128(c->Ico.V128);
+ default: vpanic("deepCopyIRConst");
+ }
+}
+
+IRCallee* deepCopyIRCallee ( IRCallee* ce )
+{
+ IRCallee* ce2 = mkIRCallee(ce->regparms, ce->name, ce->addr);
+ ce2->mcx_mask = ce->mcx_mask;
+ return ce2;
+}
+
+IRRegArray* deepCopyIRRegArray ( IRRegArray* d )
+{
+ return mkIRRegArray(d->base, d->elemTy, d->nElems);
+}
+
+IRExpr* deepCopyIRExpr ( IRExpr* e )
+{
+ switch (e->tag) {
+ case Iex_Get:
+ return IRExpr_Get(e->Iex.Get.offset, e->Iex.Get.ty);
+ case Iex_GetI:
+ return IRExpr_GetI(deepCopyIRRegArray(e->Iex.GetI.descr),
+ deepCopyIRExpr(e->Iex.GetI.ix),
+ e->Iex.GetI.bias);
+ case Iex_RdTmp:
+ return IRExpr_RdTmp(e->Iex.RdTmp.tmp);
+ case Iex_Qop:
+ return IRExpr_Qop(e->Iex.Qop.op,
+ deepCopyIRExpr(e->Iex.Qop.arg1),
+ deepCopyIRExpr(e->Iex.Qop.arg2),
+ deepCopyIRExpr(e->Iex.Qop.arg3),
+ deepCopyIRExpr(e->Iex.Qop.arg4));
+ case Iex_Triop:
+ return IRExpr_Triop(e->Iex.Triop.op,
+ deepCopyIRExpr(e->Iex.Triop.arg1),
+ deepCopyIRExpr(e->Iex.Triop.arg2),
+ deepCopyIRExpr(e->Iex.Triop.arg3));
+ case Iex_Binop:
+ return IRExpr_Binop(e->Iex.Binop.op,
+ deepCopyIRExpr(e->Iex.Binop.arg1),
+ deepCopyIRExpr(e->Iex.Binop.arg2));
+ case Iex_Unop:
+ return IRExpr_Unop(e->Iex.Unop.op,
+ deepCopyIRExpr(e->Iex.Unop.arg));
+ case Iex_Load:
+ return IRExpr_Load(e->Iex.Load.end,
+ e->Iex.Load.ty,
+ deepCopyIRExpr(e->Iex.Load.addr));
+ case Iex_Const:
+ return IRExpr_Const(deepCopyIRConst(e->Iex.Const.con));
+ case Iex_CCall:
+ return IRExpr_CCall(deepCopyIRCallee(e->Iex.CCall.cee),
+ e->Iex.CCall.retty,
+ deepCopyIRExprVec(e->Iex.CCall.args));
+
+ case Iex_Mux0X:
+ return IRExpr_Mux0X(deepCopyIRExpr(e->Iex.Mux0X.cond),
+ deepCopyIRExpr(e->Iex.Mux0X.expr0),
+ deepCopyIRExpr(e->Iex.Mux0X.exprX));
+ default:
+ vpanic("deepCopyIRExpr");
+ }
+}
+
+IRDirty* deepCopyIRDirty ( IRDirty* d )
+{
+ Int i;
+ IRDirty* d2 = emptyIRDirty();
+ d2->cee = deepCopyIRCallee(d->cee);
+ d2->guard = deepCopyIRExpr(d->guard);
+ d2->args = deepCopyIRExprVec(d->args);
+ d2->tmp = d->tmp;
+ d2->mFx = d->mFx;
+ d2->mAddr = d->mAddr==NULL ? NULL : deepCopyIRExpr(d->mAddr);
+ d2->mSize = d->mSize;
+ d2->needsBBP = d->needsBBP;
+ d2->nFxState = d->nFxState;
+ for (i = 0; i < d2->nFxState; i++)
+ d2->fxState[i] = d->fxState[i];
+ return d2;
+}
+
+IRCAS* deepCopyIRCAS ( IRCAS* cas )
+{
+ return mkIRCAS( cas->oldHi, cas->oldLo, cas->end,
+ deepCopyIRExpr(cas->addr),
+ cas->expdHi==NULL ? NULL : deepCopyIRExpr(cas->expdHi),
+ deepCopyIRExpr(cas->expdLo),
+ cas->dataHi==NULL ? NULL : deepCopyIRExpr(cas->dataHi),
+ deepCopyIRExpr(cas->dataLo) );
+}
+
+IRStmt* deepCopyIRStmt ( IRStmt* s )
+{
+ switch (s->tag) {
+ case Ist_NoOp:
+ return IRStmt_NoOp();
+ case Ist_AbiHint:
+ return IRStmt_AbiHint(deepCopyIRExpr(s->Ist.AbiHint.base),
+ s->Ist.AbiHint.len,
+ deepCopyIRExpr(s->Ist.AbiHint.nia));
+ case Ist_IMark:
+ return IRStmt_IMark(s->Ist.IMark.addr, s->Ist.IMark.len);
+ case Ist_Put:
+ return IRStmt_Put(s->Ist.Put.offset,
+ deepCopyIRExpr(s->Ist.Put.data));
+ case Ist_PutI:
+ return IRStmt_PutI(deepCopyIRRegArray(s->Ist.PutI.descr),
+ deepCopyIRExpr(s->Ist.PutI.ix),
+ s->Ist.PutI.bias,
+ deepCopyIRExpr(s->Ist.PutI.data));
+ case Ist_WrTmp:
+ return IRStmt_WrTmp(s->Ist.WrTmp.tmp,
+ deepCopyIRExpr(s->Ist.WrTmp.data));
+ case Ist_Store:
+ return IRStmt_Store(s->Ist.Store.end,
+ deepCopyIRExpr(s->Ist.Store.addr),
+ deepCopyIRExpr(s->Ist.Store.data));
+ case Ist_CAS:
+ return IRStmt_CAS(deepCopyIRCAS(s->Ist.CAS.details));
+ case Ist_LLSC:
+ return IRStmt_LLSC(s->Ist.LLSC.end,
+ s->Ist.LLSC.result,
+ deepCopyIRExpr(s->Ist.LLSC.addr),
+ s->Ist.LLSC.storedata
+ ? deepCopyIRExpr(s->Ist.LLSC.storedata)
+ : NULL);
+ case Ist_Dirty:
+ return IRStmt_Dirty(deepCopyIRDirty(s->Ist.Dirty.details));
+ case Ist_MBE:
+ return IRStmt_MBE(s->Ist.MBE.event);
+ case Ist_Exit:
+ return IRStmt_Exit(deepCopyIRExpr(s->Ist.Exit.guard),
+ s->Ist.Exit.jk,
+ deepCopyIRConst(s->Ist.Exit.dst));
+ default:
+ vpanic("deepCopyIRStmt");
+ }
+}
+
+IRTypeEnv* deepCopyIRTypeEnv ( IRTypeEnv* src )
+{
+ Int i;
+ IRTypeEnv* dst = LibVEX_Alloc(sizeof(IRTypeEnv));
+ dst->types_size = src->types_size;
+ dst->types_used = src->types_used;
+ dst->types = LibVEX_Alloc(dst->types_size * sizeof(IRType));
+ for (i = 0; i < src->types_used; i++)
+ dst->types[i] = src->types[i];
+ return dst;
+}
+
+IRSB* deepCopyIRSB ( IRSB* bb )
+{
+ Int i;
+ IRStmt** sts2;
+ IRSB* bb2 = deepCopyIRSBExceptStmts(bb);
+ bb2->stmts_used = bb2->stmts_size = bb->stmts_used;
+ sts2 = LibVEX_Alloc(bb2->stmts_used * sizeof(IRStmt*));
+ for (i = 0; i < bb2->stmts_used; i++)
+ sts2[i] = deepCopyIRStmt(bb->stmts[i]);
+ bb2->stmts = sts2;
+ return bb2;
+}
+
+IRSB* deepCopyIRSBExceptStmts ( IRSB* bb )
+{
+ IRSB* bb2 = emptyIRSB();
+ bb2->tyenv = deepCopyIRTypeEnv(bb->tyenv);
+ bb2->next = deepCopyIRExpr(bb->next);
+ bb2->jumpkind = bb->jumpkind;
+ return bb2;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Primop types ---*/
+/*---------------------------------------------------------------*/
+
+static
+void typeOfPrimop ( IROp op,
+ /*OUTs*/
+ IRType* t_dst,
+ IRType* t_arg1, IRType* t_arg2,
+ IRType* t_arg3, IRType* t_arg4 )
+{
+# define UNARY(_ta1,_td) \
+ *t_dst = (_td); *t_arg1 = (_ta1); break
+# define BINARY(_ta1,_ta2,_td) \
+ *t_dst = (_td); *t_arg1 = (_ta1); *t_arg2 = (_ta2); break
+# define TERNARY(_ta1,_ta2,_ta3,_td) \
+ *t_dst = (_td); *t_arg1 = (_ta1); \
+ *t_arg2 = (_ta2); *t_arg3 = (_ta3); break
+# define QUATERNARY(_ta1,_ta2,_ta3,_ta4,_td) \
+ *t_dst = (_td); *t_arg1 = (_ta1); \
+ *t_arg2 = (_ta2); *t_arg3 = (_ta3); \
+ *t_arg4 = (_ta4); break
+# define COMPARISON(_ta) \
+     *t_dst = Ity_I1; *t_arg1 = *t_arg2 = (_ta); break
+# define UNARY_COMPARISON(_ta) \
+     *t_dst = Ity_I1; *t_arg1 = (_ta); break
+
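+/* Example (illustrative): inside a case arm, the use
+      BINARY(Ity_I8,Ity_I8, Ity_I8);
+   expands to
+      *t_dst = (Ity_I8); *t_arg1 = (Ity_I8); *t_arg2 = (Ity_I8); break;
+   i.e. each macro records the destination type, then the argument
+   types, and breaks out of the switch below. */
+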
+ /* Rounding mode values are always Ity_I32, encoded as per
+ IRRoundingMode */
+ const IRType ity_RMode = Ity_I32;
+
+ *t_dst = Ity_INVALID;
+ *t_arg1 = Ity_INVALID;
+ *t_arg2 = Ity_INVALID;
+ *t_arg3 = Ity_INVALID;
+ *t_arg4 = Ity_INVALID;
+ switch (op) {
+ case Iop_Add8: case Iop_Sub8: case Iop_Mul8:
+ case Iop_Or8: case Iop_And8: case Iop_Xor8:
+ BINARY(Ity_I8,Ity_I8, Ity_I8);
+
+ case Iop_Add16: case Iop_Sub16: case Iop_Mul16:
+ case Iop_Or16: case Iop_And16: case Iop_Xor16:
+ BINARY(Ity_I16,Ity_I16, Ity_I16);
+
+ case Iop_CmpORD32U:
+ case Iop_CmpORD32S:
+ case Iop_Add32: case Iop_Sub32: case Iop_Mul32:
+ case Iop_Or32: case Iop_And32: case Iop_Xor32:
+ case Iop_Max32U:
+ case Iop_Add16x2: case Iop_Sub16x2:
+ case Iop_QAdd16Sx2: case Iop_QAdd16Ux2:
+ case Iop_QSub16Sx2: case Iop_QSub16Ux2:
+ case Iop_HAdd16Ux2: case Iop_HAdd16Sx2:
+ case Iop_HSub16Ux2: case Iop_HSub16Sx2:
+ case Iop_Add8x4: case Iop_Sub8x4:
+ case Iop_QAdd8Sx4: case Iop_QAdd8Ux4:
+ case Iop_QSub8Sx4: case Iop_QSub8Ux4:
+ case Iop_HAdd8Ux4: case Iop_HAdd8Sx4:
+ case Iop_HSub8Ux4: case Iop_HSub8Sx4:
+ case Iop_Sad8Ux4:
+ BINARY(Ity_I32,Ity_I32, Ity_I32);
+
+ case Iop_Add64: case Iop_Sub64: case Iop_Mul64:
+ case Iop_Or64: case Iop_And64: case Iop_Xor64:
+ case Iop_CmpORD64U:
+ case Iop_CmpORD64S:
+ case Iop_Avg8Ux8: case Iop_Avg16Ux4:
+ case Iop_Add8x8: case Iop_Add16x4: case Iop_Add32x2:
+ case Iop_Add32Fx2: case Iop_Sub32Fx2:
+ case Iop_CmpEQ8x8: case Iop_CmpEQ16x4: case Iop_CmpEQ32x2:
+ case Iop_CmpGT8Sx8: case Iop_CmpGT16Sx4: case Iop_CmpGT32Sx2:
+ case Iop_CmpGT8Ux8: case Iop_CmpGT16Ux4: case Iop_CmpGT32Ux2:
+ case Iop_CmpGT32Fx2: case Iop_CmpEQ32Fx2: case Iop_CmpGE32Fx2:
+ case Iop_InterleaveHI8x8: case Iop_InterleaveLO8x8:
+ case Iop_InterleaveHI16x4: case Iop_InterleaveLO16x4:
+ case Iop_InterleaveHI32x2: case Iop_InterleaveLO32x2:
+ case Iop_CatOddLanes8x8: case Iop_CatEvenLanes8x8:
+ case Iop_CatOddLanes16x4: case Iop_CatEvenLanes16x4:
+ case Iop_InterleaveOddLanes8x8: case Iop_InterleaveEvenLanes8x8:
+ case Iop_InterleaveOddLanes16x4: case Iop_InterleaveEvenLanes16x4:
+ case Iop_Perm8x8:
+ case Iop_Max8Ux8: case Iop_Max16Ux4: case Iop_Max32Ux2:
+ case Iop_Max8Sx8: case Iop_Max16Sx4: case Iop_Max32Sx2:
+ case Iop_Max32Fx2: case Iop_Min32Fx2:
+ case Iop_PwMax32Fx2: case Iop_PwMin32Fx2:
+ case Iop_Min8Ux8: case Iop_Min16Ux4: case Iop_Min32Ux2:
+ case Iop_Min8Sx8: case Iop_Min16Sx4: case Iop_Min32Sx2:
+ case Iop_PwMax8Ux8: case Iop_PwMax16Ux4: case Iop_PwMax32Ux2:
+ case Iop_PwMax8Sx8: case Iop_PwMax16Sx4: case Iop_PwMax32Sx2:
+ case Iop_PwMin8Ux8: case Iop_PwMin16Ux4: case Iop_PwMin32Ux2:
+ case Iop_PwMin8Sx8: case Iop_PwMin16Sx4: case Iop_PwMin32Sx2:
+ case Iop_Mul8x8: case Iop_Mul16x4: case Iop_Mul32x2:
+ case Iop_Mul32Fx2:
+ case Iop_PolynomialMul8x8:
+ case Iop_MulHi16Sx4: case Iop_MulHi16Ux4:
+ case Iop_QDMulHi16Sx4: case Iop_QDMulHi32Sx2:
+ case Iop_QRDMulHi16Sx4: case Iop_QRDMulHi32Sx2:
+ case Iop_QAdd8Sx8: case Iop_QAdd16Sx4:
+ case Iop_QAdd32Sx2: case Iop_QAdd64Sx1:
+ case Iop_QAdd8Ux8: case Iop_QAdd16Ux4:
+ case Iop_QAdd32Ux2: case Iop_QAdd64Ux1:
+ case Iop_PwAdd8x8: case Iop_PwAdd16x4: case Iop_PwAdd32x2:
+ case Iop_PwAdd32Fx2:
+ case Iop_QNarrow32Sx2:
+ case Iop_QNarrow16Sx4: case Iop_QNarrow16Ux4:
+ case Iop_Sub8x8: case Iop_Sub16x4: case Iop_Sub32x2:
+ case Iop_QSub8Sx8: case Iop_QSub16Sx4:
+ case Iop_QSub32Sx2: case Iop_QSub64Sx1:
+ case Iop_QSub8Ux8: case Iop_QSub16Ux4:
+ case Iop_QSub32Ux2: case Iop_QSub64Ux1:
+ case Iop_Shl8x8: case Iop_Shl16x4: case Iop_Shl32x2:
+ case Iop_Shr8x8: case Iop_Shr16x4: case Iop_Shr32x2:
+ case Iop_Sar8x8: case Iop_Sar16x4: case Iop_Sar32x2:
+ case Iop_Sal8x8: case Iop_Sal16x4: case Iop_Sal32x2: case Iop_Sal64x1:
+ case Iop_QShl8x8: case Iop_QShl16x4: case Iop_QShl32x2: case Iop_QShl64x1:
+ case Iop_QSal8x8: case Iop_QSal16x4: case Iop_QSal32x2: case Iop_QSal64x1:
+ case Iop_Recps32Fx2:
+ case Iop_Rsqrts32Fx2:
+ BINARY(Ity_I64,Ity_I64, Ity_I64);
+
+ case Iop_ShlN32x2: case Iop_ShlN16x4: case Iop_ShlN8x8:
+ case Iop_ShrN32x2: case Iop_ShrN16x4: case Iop_ShrN8x8:
+ case Iop_SarN32x2: case Iop_SarN16x4: case Iop_SarN8x8:
+ case Iop_QShlN8x8: case Iop_QShlN16x4:
+ case Iop_QShlN32x2: case Iop_QShlN64x1:
+ case Iop_QShlN8Sx8: case Iop_QShlN16Sx4:
+ case Iop_QShlN32Sx2: case Iop_QShlN64Sx1:
+ case Iop_QSalN8x8: case Iop_QSalN16x4:
+ case Iop_QSalN32x2: case Iop_QSalN64x1:
+ BINARY(Ity_I64,Ity_I8, Ity_I64);
+
+ case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
+ BINARY(Ity_I8,Ity_I8, Ity_I8);
+ case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
+ BINARY(Ity_I16,Ity_I8, Ity_I16);
+ case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
+ BINARY(Ity_I32,Ity_I8, Ity_I32);
+ case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
+ BINARY(Ity_I64,Ity_I8, Ity_I64);
+
+ case Iop_Not8:
+ UNARY(Ity_I8, Ity_I8);
+ case Iop_Not16:
+ UNARY(Ity_I16, Ity_I16);
+ case Iop_Not32:
+ case Iop_CmpNEZ16x2: case Iop_CmpNEZ8x4:
+ UNARY(Ity_I32, Ity_I32);
+
+ case Iop_Not64:
+ case Iop_CmpNEZ32x2: case Iop_CmpNEZ16x4: case Iop_CmpNEZ8x8:
+ case Iop_Cnt8x8:
+ case Iop_Clz8Sx8: case Iop_Clz16Sx4: case Iop_Clz32Sx2:
+ case Iop_Cls8Sx8: case Iop_Cls16Sx4: case Iop_Cls32Sx2:
+ case Iop_PwAddL8Ux8: case Iop_PwAddL16Ux4: case Iop_PwAddL32Ux2:
+ case Iop_PwAddL8Sx8: case Iop_PwAddL16Sx4: case Iop_PwAddL32Sx2:
+ case Iop_Reverse64_8x8: case Iop_Reverse64_16x4: case Iop_Reverse64_32x2:
+ case Iop_Reverse32_8x8: case Iop_Reverse32_16x4:
+ case Iop_Reverse16_8x8:
+ case Iop_FtoI32Sx2_RZ: case Iop_FtoI32Ux2_RZ:
+ case Iop_I32StoFx2: case Iop_I32UtoFx2:
+ case Iop_Recip32x2: case Iop_Recip32Fx2:
+ case Iop_Abs32Fx2:
+ case Iop_Rsqrte32Fx2:
+ case Iop_Rsqrte32x2:
+ case Iop_Neg32Fx2:
+ case Iop_Abs8x8: case Iop_Abs16x4: case Iop_Abs32x2:
+ UNARY(Ity_I64, Ity_I64);
+
+ case Iop_CmpEQ8: case Iop_CmpNE8:
+ case Iop_CasCmpEQ8: case Iop_CasCmpNE8:
+ COMPARISON(Ity_I8);
+ case Iop_CmpEQ16: case Iop_CmpNE16:
+ case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
+ COMPARISON(Ity_I16);
+ case Iop_CmpEQ32: case Iop_CmpNE32:
+ case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
+ case Iop_CmpLT32S: case Iop_CmpLE32S:
+ case Iop_CmpLT32U: case Iop_CmpLE32U:
+ COMPARISON(Ity_I32);
+ case Iop_CmpEQ64: case Iop_CmpNE64:
+ case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
+ case Iop_CmpLT64S: case Iop_CmpLE64S:
+ case Iop_CmpLT64U: case Iop_CmpLE64U:
+ COMPARISON(Ity_I64);
+
+ case Iop_CmpNEZ8: UNARY_COMPARISON(Ity_I8);
+ case Iop_CmpNEZ16: UNARY_COMPARISON(Ity_I16);
+ case Iop_CmpNEZ32: UNARY_COMPARISON(Ity_I32);
+ case Iop_CmpNEZ64: UNARY_COMPARISON(Ity_I64);
+
+ case Iop_Left8: UNARY(Ity_I8, Ity_I8);
+ case Iop_Left16: UNARY(Ity_I16,Ity_I16);
+ case Iop_CmpwNEZ32: case Iop_Left32: UNARY(Ity_I32,Ity_I32);
+ case Iop_CmpwNEZ64: case Iop_Left64: UNARY(Ity_I64,Ity_I64);
+
+ case Iop_MullU8: case Iop_MullS8:
+ BINARY(Ity_I8,Ity_I8, Ity_I16);
+ case Iop_MullU16: case Iop_MullS16:
+ BINARY(Ity_I16,Ity_I16, Ity_I32);
+ case Iop_MullU32: case Iop_MullS32:
+ BINARY(Ity_I32,Ity_I32, Ity_I64);
+ case Iop_MullU64: case Iop_MullS64:
+ BINARY(Ity_I64,Ity_I64, Ity_I128);
+
+ case Iop_Clz32: case Iop_Ctz32:
+ UNARY(Ity_I32, Ity_I32);
+
+ case Iop_Clz64: case Iop_Ctz64:
+ UNARY(Ity_I64, Ity_I64);
+
+ case Iop_DivU32: case Iop_DivS32:
+ BINARY(Ity_I32,Ity_I32, Ity_I32);
+
+ case Iop_DivU64: case Iop_DivS64:
+ BINARY(Ity_I64,Ity_I64, Ity_I64);
+
+ case Iop_DivModU64to32: case Iop_DivModS64to32:
+ BINARY(Ity_I64,Ity_I32, Ity_I64);
+
+ case Iop_DivModU128to64: case Iop_DivModS128to64:
+ BINARY(Ity_I128,Ity_I64, Ity_I128);
+
+ case Iop_16HIto8: case Iop_16to8:
+ UNARY(Ity_I16, Ity_I8);
+ case Iop_8HLto16:
+ BINARY(Ity_I8,Ity_I8, Ity_I16);
+
+ case Iop_32HIto16: case Iop_32to16:
+ UNARY(Ity_I32, Ity_I16);
+ case Iop_16HLto32:
+ BINARY(Ity_I16,Ity_I16, Ity_I32);
+
+ case Iop_64HIto32: case Iop_64to32:
+ UNARY(Ity_I64, Ity_I32);
+ case Iop_32HLto64:
+ BINARY(Ity_I32,Ity_I32, Ity_I64);
+
+ case Iop_128HIto64: case Iop_128to64:
+ UNARY(Ity_I128, Ity_I64);
+ case Iop_64HLto128:
+ BINARY(Ity_I64,Ity_I64, Ity_I128);
+
+ case Iop_Not1: UNARY(Ity_I1, Ity_I1);
+ case Iop_1Uto8: UNARY(Ity_I1, Ity_I8);
+ case Iop_1Sto8: UNARY(Ity_I1, Ity_I8);
+ case Iop_1Sto16: UNARY(Ity_I1, Ity_I16);
+ case Iop_1Uto32: case Iop_1Sto32: UNARY(Ity_I1, Ity_I32);
+ case Iop_1Sto64: case Iop_1Uto64: UNARY(Ity_I1, Ity_I64);
+ case Iop_32to1: UNARY(Ity_I32, Ity_I1);
+ case Iop_64to1: UNARY(Ity_I64, Ity_I1);
+
+ case Iop_8Uto32: case Iop_8Sto32:
+ UNARY(Ity_I8, Ity_I32);
+
+ case Iop_8Uto16: case Iop_8Sto16:
+ UNARY(Ity_I8, Ity_I16);
+
+ case Iop_16Uto32: case Iop_16Sto32:
+ UNARY(Ity_I16, Ity_I32);
+
+ case Iop_32Sto64: case Iop_32Uto64:
+ UNARY(Ity_I32, Ity_I64);
+
+ case Iop_8Uto64: case Iop_8Sto64:
+ UNARY(Ity_I8, Ity_I64);
+
+ case Iop_16Uto64: case Iop_16Sto64:
+ UNARY(Ity_I16, Ity_I64);
+ case Iop_64to16:
+ UNARY(Ity_I64, Ity_I16);
+
+ case Iop_32to8: UNARY(Ity_I32, Ity_I8);
+ case Iop_64to8: UNARY(Ity_I64, Ity_I8);
+
+ case Iop_AddF64: case Iop_SubF64:
+ case Iop_MulF64: case Iop_DivF64:
+ case Iop_AddF64r32: case Iop_SubF64r32:
+ case Iop_MulF64r32: case Iop_DivF64r32:
+ TERNARY(ity_RMode,Ity_F64,Ity_F64, Ity_F64);
+
+ case Iop_AddF32: case Iop_SubF32:
+ case Iop_MulF32: case Iop_DivF32:
+ TERNARY(ity_RMode,Ity_F32,Ity_F32, Ity_F32);
+
+ case Iop_NegF64: case Iop_AbsF64:
+ UNARY(Ity_F64, Ity_F64);
+
+ case Iop_NegF32: case Iop_AbsF32:
+ UNARY(Ity_F32, Ity_F32);
+
+ case Iop_SqrtF64:
+ case Iop_SqrtF64r32:
+ BINARY(ity_RMode,Ity_F64, Ity_F64);
+
+ case Iop_SqrtF32:
+ case Iop_RoundF32toInt:
+ BINARY(ity_RMode,Ity_F32, Ity_F32);
+
+ case Iop_CmpF64:
+ BINARY(Ity_F64,Ity_F64, Ity_I32);
+
+ case Iop_F64toI16S: BINARY(ity_RMode,Ity_F64, Ity_I16);
+ case Iop_F64toI32S: BINARY(ity_RMode,Ity_F64, Ity_I32);
+ case Iop_F64toI64S: BINARY(ity_RMode,Ity_F64, Ity_I64);
+
+ case Iop_F64toI32U: BINARY(ity_RMode,Ity_F64, Ity_I32);
+
+ case Iop_I16StoF64: UNARY(Ity_I16, Ity_F64);
+ case Iop_I32StoF64: UNARY(Ity_I32, Ity_F64);
+ case Iop_I64StoF64: BINARY(ity_RMode,Ity_I64, Ity_F64);
+
+ case Iop_I32UtoF64: UNARY(Ity_I32, Ity_F64);
+
+ case Iop_F32toF64: UNARY(Ity_F32, Ity_F64);
+ case Iop_F64toF32: BINARY(ity_RMode,Ity_F64, Ity_F32);
+
+ case Iop_ReinterpI64asF64: UNARY(Ity_I64, Ity_F64);
+ case Iop_ReinterpF64asI64: UNARY(Ity_F64, Ity_I64);
+ case Iop_ReinterpI32asF32: UNARY(Ity_I32, Ity_F32);
+ case Iop_ReinterpF32asI32: UNARY(Ity_F32, Ity_I32);
+
+ case Iop_AtanF64: case Iop_Yl2xF64: case Iop_Yl2xp1F64:
+ case Iop_ScaleF64: case Iop_PRemF64: case Iop_PRem1F64:
+ TERNARY(ity_RMode,Ity_F64,Ity_F64, Ity_F64);
+
+ case Iop_PRemC3210F64: case Iop_PRem1C3210F64:
+ TERNARY(ity_RMode,Ity_F64,Ity_F64, Ity_I32);
+
+ case Iop_SinF64: case Iop_CosF64: case Iop_TanF64:
+ case Iop_2xm1F64:
+ case Iop_RoundF64toInt: BINARY(ity_RMode,Ity_F64, Ity_F64);
+
+ case Iop_MAddF64: case Iop_MSubF64:
+ case Iop_MAddF64r32: case Iop_MSubF64r32:
+ QUATERNARY(ity_RMode,Ity_F64,Ity_F64,Ity_F64, Ity_F64);
+
+ case Iop_Est5FRSqrt:
+ case Iop_RoundF64toF64_NEAREST: case Iop_RoundF64toF64_NegINF:
+ case Iop_RoundF64toF64_PosINF: case Iop_RoundF64toF64_ZERO:
+ UNARY(Ity_F64, Ity_F64);
+ case Iop_RoundF64toF32:
+ BINARY(ity_RMode,Ity_F64, Ity_F64);
+ case Iop_CalcFPRF:
+ UNARY(Ity_F64, Ity_I32);
+ case Iop_TruncF64asF32:
+ UNARY(Ity_F64, Ity_F32);
+
+ case Iop_I32UtoFx4:
+ case Iop_I32StoFx4:
+ case Iop_QFtoI32Ux4_RZ:
+ case Iop_QFtoI32Sx4_RZ:
+ case Iop_FtoI32Ux4_RZ:
+ case Iop_FtoI32Sx4_RZ:
+ case Iop_RoundF32x4_RM:
+ case Iop_RoundF32x4_RP:
+ case Iop_RoundF32x4_RN:
+ case Iop_RoundF32x4_RZ:
+ case Iop_Abs32Fx4:
+ case Iop_Rsqrte32Fx4:
+ case Iop_Rsqrte32x4:
+ UNARY(Ity_V128, Ity_V128);
+
+ case Iop_64HLtoV128: BINARY(Ity_I64,Ity_I64, Ity_V128);
+ case Iop_V128to64: case Iop_V128HIto64:
+ case Iop_Shorten16x8: case Iop_Shorten32x4: case Iop_Shorten64x2:
+ case Iop_QShortenU16Ux8: case Iop_QShortenU32Ux4: case Iop_QShortenU64Ux2:
+ case Iop_QShortenS16Sx8: case Iop_QShortenS32Sx4: case Iop_QShortenS64Sx2:
+ case Iop_QShortenU16Sx8: case Iop_QShortenU32Sx4: case Iop_QShortenU64Sx2:
+ case Iop_F32toF16x4:
+ UNARY(Ity_V128, Ity_I64);
+
+ case Iop_Longen8Ux8: case Iop_Longen16Ux4: case Iop_Longen32Ux2:
+ case Iop_Longen8Sx8: case Iop_Longen16Sx4: case Iop_Longen32Sx2:
+ case Iop_F16toF32x4:
+ UNARY(Ity_I64, Ity_V128);
+
+ case Iop_V128to32: UNARY(Ity_V128, Ity_I32);
+ case Iop_32UtoV128: UNARY(Ity_I32, Ity_V128);
+ case Iop_64UtoV128: UNARY(Ity_I64, Ity_V128);
+ case Iop_SetV128lo32: BINARY(Ity_V128,Ity_I32, Ity_V128);
+ case Iop_SetV128lo64: BINARY(Ity_V128,Ity_I64, Ity_V128);
+
+ case Iop_Dup8x16: UNARY(Ity_I8, Ity_V128);
+ case Iop_Dup16x8: UNARY(Ity_I16, Ity_V128);
+ case Iop_Dup32x4: UNARY(Ity_I32, Ity_V128);
+ case Iop_Dup8x8: UNARY(Ity_I8, Ity_I64);
+ case Iop_Dup16x4: UNARY(Ity_I16, Ity_I64);
+ case Iop_Dup32x2: UNARY(Ity_I32, Ity_I64);
+
+ case Iop_CmpEQ32Fx4: case Iop_CmpLT32Fx4:
+ case Iop_CmpEQ64Fx2: case Iop_CmpLT64Fx2:
+ case Iop_CmpLE32Fx4: case Iop_CmpUN32Fx4:
+ case Iop_CmpLE64Fx2: case Iop_CmpUN64Fx2:
+ case Iop_CmpGT32Fx4: case Iop_CmpGE32Fx4:
+ case Iop_CmpEQ32F0x4: case Iop_CmpLT32F0x4:
+ case Iop_CmpEQ64F0x2: case Iop_CmpLT64F0x2:
+ case Iop_CmpLE32F0x4: case Iop_CmpUN32F0x4:
+ case Iop_CmpLE64F0x2: case Iop_CmpUN64F0x2:
+ case Iop_Add32Fx4: case Iop_Add32F0x4:
+ case Iop_Add64Fx2: case Iop_Add64F0x2:
+ case Iop_Div32Fx4: case Iop_Div32F0x4:
+ case Iop_Div64Fx2: case Iop_Div64F0x2:
+ case Iop_Max32Fx4: case Iop_Max32F0x4:
+ case Iop_PwMax32Fx4: case Iop_PwMin32Fx4:
+ case Iop_Max64Fx2: case Iop_Max64F0x2:
+ case Iop_Min32Fx4: case Iop_Min32F0x4:
+ case Iop_Min64Fx2: case Iop_Min64F0x2:
+ case Iop_Mul32Fx4: case Iop_Mul32F0x4:
+ case Iop_Mul64Fx2: case Iop_Mul64F0x2:
+ case Iop_Sub32Fx4: case Iop_Sub32F0x4:
+ case Iop_Sub64Fx2: case Iop_Sub64F0x2:
+ case Iop_AndV128: case Iop_OrV128: case Iop_XorV128:
+ case Iop_Add8x16: case Iop_Add16x8:
+ case Iop_Add32x4: case Iop_Add64x2:
+ case Iop_QAdd8Ux16: case Iop_QAdd16Ux8:
+ case Iop_QAdd32Ux4: //case Iop_QAdd64Ux2:
+ case Iop_QAdd8Sx16: case Iop_QAdd16Sx8:
+ case Iop_QAdd32Sx4: case Iop_QAdd64Sx2:
+ case Iop_PwAdd8x16: case Iop_PwAdd16x8: case Iop_PwAdd32x4:
+ case Iop_Sub8x16: case Iop_Sub16x8:
+ case Iop_Sub32x4: case Iop_Sub64x2:
+ case Iop_QSub8Ux16: case Iop_QSub16Ux8:
+ case Iop_QSub32Ux4: //case Iop_QSub64Ux2:
+ case Iop_QSub8Sx16: case Iop_QSub16Sx8:
+ case Iop_QSub32Sx4: case Iop_QSub64Sx2:
+ case Iop_Mul8x16: case Iop_Mul16x8: case Iop_Mul32x4:
+ case Iop_PolynomialMul8x16:
+ case Iop_MulHi16Ux8: case Iop_MulHi32Ux4:
+ case Iop_MulHi16Sx8: case Iop_MulHi32Sx4:
+ case Iop_QDMulHi16Sx8: case Iop_QDMulHi32Sx4:
+ case Iop_QRDMulHi16Sx8: case Iop_QRDMulHi32Sx4:
+ case Iop_MullEven8Ux16: case Iop_MullEven16Ux8:
+ case Iop_MullEven8Sx16: case Iop_MullEven16Sx8:
+ case Iop_Avg8Ux16: case Iop_Avg16Ux8: case Iop_Avg32Ux4:
+ case Iop_Avg8Sx16: case Iop_Avg16Sx8: case Iop_Avg32Sx4:
+ case Iop_Max8Sx16: case Iop_Max16Sx8: case Iop_Max32Sx4:
+ case Iop_Max8Ux16: case Iop_Max16Ux8: case Iop_Max32Ux4:
+ case Iop_Min8Sx16: case Iop_Min16Sx8: case Iop_Min32Sx4:
+ case Iop_Min8Ux16: case Iop_Min16Ux8: case Iop_Min32Ux4:
+ case Iop_CmpEQ8x16: case Iop_CmpEQ16x8: case Iop_CmpEQ32x4:
+ case Iop_CmpGT8Sx16: case Iop_CmpGT16Sx8: case Iop_CmpGT32Sx4:
+ case Iop_CmpGT64Sx2:
+ case Iop_CmpGT8Ux16: case Iop_CmpGT16Ux8: case Iop_CmpGT32Ux4:
+ case Iop_Shl8x16: case Iop_Shl16x8: case Iop_Shl32x4: case Iop_Shl64x2:
+ case Iop_QShl8x16: case Iop_QShl16x8: case Iop_QShl32x4: case Iop_QShl64x2:
+ case Iop_QSal8x16: case Iop_QSal16x8: case Iop_QSal32x4: case Iop_QSal64x2:
+ case Iop_Shr8x16: case Iop_Shr16x8: case Iop_Shr32x4: case Iop_Shr64x2:
+ case Iop_Sar8x16: case Iop_Sar16x8: case Iop_Sar32x4: case Iop_Sar64x2:
+ case Iop_Sal8x16: case Iop_Sal16x8: case Iop_Sal32x4: case Iop_Sal64x2:
+ case Iop_Rol8x16: case Iop_Rol16x8: case Iop_Rol32x4:
+ case Iop_QNarrow16Ux8: case Iop_QNarrow32Ux4:
+ case Iop_QNarrow16Sx8: case Iop_QNarrow32Sx4:
+ case Iop_Narrow16x8: case Iop_Narrow32x4:
+ case Iop_InterleaveHI8x16: case Iop_InterleaveHI16x8:
+ case Iop_InterleaveHI32x4: case Iop_InterleaveHI64x2:
+ case Iop_InterleaveLO8x16: case Iop_InterleaveLO16x8:
+ case Iop_InterleaveLO32x4: case Iop_InterleaveLO64x2:
+ case Iop_CatOddLanes8x16: case Iop_CatEvenLanes8x16:
+ case Iop_CatOddLanes16x8: case Iop_CatEvenLanes16x8:
+ case Iop_CatOddLanes32x4: case Iop_CatEvenLanes32x4:
+ case Iop_InterleaveOddLanes8x16: case Iop_InterleaveEvenLanes8x16:
+ case Iop_InterleaveOddLanes16x8: case Iop_InterleaveEvenLanes16x8:
+ case Iop_InterleaveOddLanes32x4: case Iop_InterleaveEvenLanes32x4:
+ case Iop_Perm8x16:
+ case Iop_Recps32Fx4:
+ case Iop_Rsqrts32Fx4:
+ BINARY(Ity_V128,Ity_V128, Ity_V128);
+
+ case Iop_PolynomialMull8x8:
+ case Iop_Mull8Ux8: case Iop_Mull8Sx8:
+ case Iop_Mull16Ux4: case Iop_Mull16Sx4:
+ case Iop_Mull32Ux2: case Iop_Mull32Sx2:
+ BINARY(Ity_I64, Ity_I64, Ity_V128);
+
+ case Iop_NotV128:
+ case Iop_Recip32Fx4: case Iop_Recip32F0x4:
+ case Iop_Recip32x4:
+ case Iop_Recip64Fx2: case Iop_Recip64F0x2:
+ case Iop_RSqrt32Fx4: case Iop_RSqrt32F0x4:
+ case Iop_RSqrt64Fx2: case Iop_RSqrt64F0x2:
+ case Iop_Sqrt32Fx4: case Iop_Sqrt32F0x4:
+ case Iop_Sqrt64Fx2: case Iop_Sqrt64F0x2:
+ case Iop_CmpNEZ8x16: case Iop_CmpNEZ16x8:
+ case Iop_CmpNEZ32x4: case Iop_CmpNEZ64x2:
+ case Iop_Cnt8x16:
+ case Iop_Clz8Sx16: case Iop_Clz16Sx8: case Iop_Clz32Sx4:
+ case Iop_Cls8Sx16: case Iop_Cls16Sx8: case Iop_Cls32Sx4:
+ case Iop_PwAddL8Ux16: case Iop_PwAddL16Ux8: case Iop_PwAddL32Ux4:
+ case Iop_PwAddL8Sx16: case Iop_PwAddL16Sx8: case Iop_PwAddL32Sx4:
+ case Iop_Reverse64_8x16: case Iop_Reverse64_16x8: case Iop_Reverse64_32x4:
+ case Iop_Reverse32_8x16: case Iop_Reverse32_16x8:
+ case Iop_Reverse16_8x16:
+ case Iop_Neg32Fx4:
+ case Iop_Abs8x16: case Iop_Abs16x8: case Iop_Abs32x4:
+ UNARY(Ity_V128, Ity_V128);
+
+ case Iop_ShlV128: case Iop_ShrV128:
+ case Iop_ShlN8x16: case Iop_ShlN16x8:
+ case Iop_ShlN32x4: case Iop_ShlN64x2:
+ case Iop_ShrN8x16: case Iop_ShrN16x8:
+ case Iop_ShrN32x4: case Iop_ShrN64x2:
+ case Iop_SarN8x16: case Iop_SarN16x8:
+ case Iop_SarN32x4: case Iop_SarN64x2:
+ case Iop_QShlN8x16: case Iop_QShlN16x8:
+ case Iop_QShlN32x4: case Iop_QShlN64x2:
+ case Iop_QShlN8Sx16: case Iop_QShlN16Sx8:
+ case Iop_QShlN32Sx4: case Iop_QShlN64Sx2:
+ case Iop_QSalN8x16: case Iop_QSalN16x8:
+ case Iop_QSalN32x4: case Iop_QSalN64x2:
+ BINARY(Ity_V128,Ity_I8, Ity_V128);
+
+ case Iop_F32ToFixed32Ux4_RZ:
+ case Iop_F32ToFixed32Sx4_RZ:
+ case Iop_Fixed32UToF32x4_RN:
+ case Iop_Fixed32SToF32x4_RN:
+ BINARY(Ity_V128, Ity_I8, Ity_V128);
+
+ case Iop_F32ToFixed32Ux2_RZ:
+ case Iop_F32ToFixed32Sx2_RZ:
+ case Iop_Fixed32UToF32x2_RN:
+ case Iop_Fixed32SToF32x2_RN:
+ BINARY(Ity_I64, Ity_I8, Ity_I64);
+
+ case Iop_GetElem8x16:
+ BINARY(Ity_V128, Ity_I8, Ity_I8);
+ case Iop_GetElem16x8:
+ BINARY(Ity_V128, Ity_I8, Ity_I16);
+ case Iop_GetElem32x4:
+ BINARY(Ity_V128, Ity_I8, Ity_I32);
+ case Iop_GetElem64x2:
+ BINARY(Ity_V128, Ity_I8, Ity_I64);
+ case Iop_GetElem8x8:
+ BINARY(Ity_I64, Ity_I8, Ity_I8);
+ case Iop_GetElem16x4:
+ BINARY(Ity_I64, Ity_I8, Ity_I16);
+ case Iop_GetElem32x2:
+ BINARY(Ity_I64, Ity_I8, Ity_I32);
+ case Iop_SetElem8x8:
+ TERNARY(Ity_I64, Ity_I8, Ity_I8, Ity_I64);
+ case Iop_SetElem16x4:
+ TERNARY(Ity_I64, Ity_I8, Ity_I16, Ity_I64);
+ case Iop_SetElem32x2:
+ TERNARY(Ity_I64, Ity_I8, Ity_I32, Ity_I64);
+
+ case Iop_Extract64:
+ TERNARY(Ity_I64, Ity_I64, Ity_I8, Ity_I64);
+ case Iop_ExtractV128:
+ TERNARY(Ity_V128, Ity_V128, Ity_I8, Ity_V128);
+
+ case Iop_QDMulLong16Sx4: case Iop_QDMulLong32Sx2:
+ BINARY(Ity_I64, Ity_I64, Ity_V128);
+
+ default:
+ ppIROp(op);
+ vpanic("typeOfPrimop");
+ }
+# undef UNARY
+# undef BINARY
+# undef TERNARY
+# undef QUATERNARY
+# undef COMPARISON
+# undef UNARY_COMPARISON
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Helper functions for the IR -- IR Basic Blocks ---*/
+/*---------------------------------------------------------------*/
+
+void addStmtToIRSB ( IRSB* bb, IRStmt* st )
+{
+ Int i;
+ if (bb->stmts_used == bb->stmts_size) {
+ IRStmt** stmts2 = LibVEX_Alloc(2 * bb->stmts_size * sizeof(IRStmt*));
+ for (i = 0; i < bb->stmts_size; i++)
+ stmts2[i] = bb->stmts[i];
+ bb->stmts = stmts2;
+ bb->stmts_size *= 2;
+ }
+ vassert(bb->stmts_used < bb->stmts_size);
+ bb->stmts[bb->stmts_used] = st;
+ bb->stmts_used++;
+}
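+
+/* Usage sketch (illustrative): blocks are built by appending one
+   statement at a time; the backing array doubles when full, so
+   appending is amortised O(1).
+
+      IRSB* bb = emptyIRSB();
+      addStmtToIRSB(bb, IRStmt_NoOp());
+*/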
+
+
+/*---------------------------------------------------------------*/
+/*--- Helper functions for the IR -- IR Type Environments ---*/
+/*---------------------------------------------------------------*/
+
+/* Allocate a new IRTemp, given its type. */
+
+IRTemp newIRTemp ( IRTypeEnv* env, IRType ty )
+{
+ vassert(env);
+ vassert(env->types_used >= 0);
+ vassert(env->types_size >= 0);
+ vassert(env->types_used <= env->types_size);
+ if (env->types_used < env->types_size) {
+ env->types[env->types_used] = ty;
+ return env->types_used++;
+ } else {
+ Int i;
+ Int new_size = env->types_size==0 ? 8 : 2*env->types_size;
+ IRType* new_types
+ = LibVEX_Alloc(new_size * sizeof(IRType));
+ for (i = 0; i < env->types_used; i++)
+ new_types[i] = env->types[i];
+ env->types = new_types;
+ env->types_size = new_size;
+ return newIRTemp(env, ty);
+ }
+}
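+
+/* Usage sketch (illustrative): a fresh temp is just an index into
+   the type environment.  Here OFFSET is a hypothetical guest state
+   offset:
+
+      IRTemp t = newIRTemp(bb->tyenv, Ity_I32);
+      addStmtToIRSB(bb, IRStmt_WrTmp(t, IRExpr_Get(OFFSET, Ity_I32)));
+
+   declares t :: Ity_I32 and assigns it exactly once. */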
+
+
+/*---------------------------------------------------------------*/
+/*--- Helper functions for the IR -- finding types of exprs ---*/
+/*---------------------------------------------------------------*/
+
+inline
+IRType typeOfIRTemp ( IRTypeEnv* env, IRTemp tmp )
+{
+ vassert(tmp >= 0);
+ vassert(tmp < env->types_used);
+ return env->types[tmp];
+}
+
+
+IRType typeOfIRConst ( IRConst* con )
+{
+ switch (con->tag) {
+ case Ico_U1: return Ity_I1;
+ case Ico_U8: return Ity_I8;
+ case Ico_U16: return Ity_I16;
+ case Ico_U32: return Ity_I32;
+ case Ico_U64: return Ity_I64;
+ case Ico_F64: return Ity_F64;
+ case Ico_F64i: return Ity_F64;
+ case Ico_V128: return Ity_V128;
+ default: vpanic("typeOfIRConst");
+ }
+}
+
+IRType typeOfIRExpr ( IRTypeEnv* tyenv, IRExpr* e )
+{
+ IRType t_dst, t_arg1, t_arg2, t_arg3, t_arg4;
+ start:
+ switch (e->tag) {
+ case Iex_Load:
+ return e->Iex.Load.ty;
+ case Iex_Get:
+ return e->Iex.Get.ty;
+ case Iex_GetI:
+ return e->Iex.GetI.descr->elemTy;
+ case Iex_RdTmp:
+ return typeOfIRTemp(tyenv, e->Iex.RdTmp.tmp);
+ case Iex_Const:
+ return typeOfIRConst(e->Iex.Const.con);
+ case Iex_Qop:
+ typeOfPrimop(e->Iex.Qop.op,
+ &t_dst, &t_arg1, &t_arg2, &t_arg3, &t_arg4);
+ return t_dst;
+ case Iex_Triop:
+ typeOfPrimop(e->Iex.Triop.op,
+ &t_dst, &t_arg1, &t_arg2, &t_arg3, &t_arg4);
+ return t_dst;
+ case Iex_Binop:
+ typeOfPrimop(e->Iex.Binop.op,
+ &t_dst, &t_arg1, &t_arg2, &t_arg3, &t_arg4);
+ return t_dst;
+ case Iex_Unop:
+ typeOfPrimop(e->Iex.Unop.op,
+ &t_dst, &t_arg1, &t_arg2, &t_arg3, &t_arg4);
+ return t_dst;
+ case Iex_CCall:
+ return e->Iex.CCall.retty;
+ case Iex_Mux0X:
+ e = e->Iex.Mux0X.expr0;
+ goto start;
+ /* return typeOfIRExpr(tyenv, e->Iex.Mux0X.expr0); */
+ case Iex_Binder:
+ vpanic("typeOfIRExpr: Binder is not a valid expression");
+ default:
+ ppIRExpr(e);
+ vpanic("typeOfIRExpr");
+ }
+}
+
+/* Is this value actually in the enumeration 'IRType'? */
+Bool isPlausibleIRType ( IRType ty )
+{
+ switch (ty) {
+ case Ity_INVALID: case Ity_I1:
+ case Ity_I8: case Ity_I16: case Ity_I32:
+ case Ity_I64: case Ity_I128:
+ case Ity_F32: case Ity_F64:
+ case Ity_V128:
+ return True;
+ default:
+ return False;
+ }
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Sanity checking -- FLATNESS ---*/
+/*---------------------------------------------------------------*/
+
+/* Check that the canonical flatness constraints hold on an
+   IRStmt.  The only place where any expression is allowed to be
+   non-atomic is the RHS of IRStmt_WrTmp. */
+
+/* Relies on:
+ inline static Bool isAtom ( IRExpr* e ) {
+ return e->tag == Iex_RdTmp || e->tag == Iex_Const;
+ }
+*/
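+
+/* Example (illustrative): the statement
+      t1 = Add32(t2,t3)
+   is flat, since both operands are atoms, whereas
+      t1 = Add32(Add32(t2,t3),t4)
+   is not, since its first operand is itself a compound
+   expression. */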
+
+Bool isFlatIRStmt ( IRStmt* st )
+{
+ Int i;
+ IRExpr* e;
+ IRDirty* di;
+ IRCAS* cas;
+
+ switch (st->tag) {
+ case Ist_AbiHint:
+ return isIRAtom(st->Ist.AbiHint.base)
+ && isIRAtom(st->Ist.AbiHint.nia);
+ case Ist_Put:
+ return isIRAtom(st->Ist.Put.data);
+ case Ist_PutI:
+ return toBool( isIRAtom(st->Ist.PutI.ix)
+ && isIRAtom(st->Ist.PutI.data) );
+ case Ist_WrTmp:
+ /* This is the only interesting case. The RHS can be any
+ expression, *but* all its subexpressions *must* be
+ atoms. */
+ e = st->Ist.WrTmp.data;
+ switch (e->tag) {
+ case Iex_Binder: return True;
+ case Iex_Get: return True;
+ case Iex_GetI: return isIRAtom(e->Iex.GetI.ix);
+ case Iex_RdTmp: return True;
+ case Iex_Qop: return toBool(
+ isIRAtom(e->Iex.Qop.arg1)
+ && isIRAtom(e->Iex.Qop.arg2)
+ && isIRAtom(e->Iex.Qop.arg3)
+ && isIRAtom(e->Iex.Qop.arg4));
+ case Iex_Triop: return toBool(
+ isIRAtom(e->Iex.Triop.arg1)
+ && isIRAtom(e->Iex.Triop.arg2)
+ && isIRAtom(e->Iex.Triop.arg3));
+ case Iex_Binop: return toBool(
+ isIRAtom(e->Iex.Binop.arg1)
+ && isIRAtom(e->Iex.Binop.arg2));
+ case Iex_Unop: return isIRAtom(e->Iex.Unop.arg);
+ case Iex_Load: return isIRAtom(e->Iex.Load.addr);
+ case Iex_Const: return True;
+ case Iex_CCall: for (i = 0; e->Iex.CCall.args[i]; i++)
+ if (!isIRAtom(e->Iex.CCall.args[i]))
+ return False;
+ return True;
+ case Iex_Mux0X: return toBool (
+ isIRAtom(e->Iex.Mux0X.cond)
+ && isIRAtom(e->Iex.Mux0X.expr0)
+ && isIRAtom(e->Iex.Mux0X.exprX));
+ default: vpanic("isFlatIRStmt(e)");
+ }
+ /*notreached*/
+ vassert(0);
+ case Ist_Store:
+ return toBool( isIRAtom(st->Ist.Store.addr)
+ && isIRAtom(st->Ist.Store.data) );
+ case Ist_CAS:
+ cas = st->Ist.CAS.details;
+ return toBool( isIRAtom(cas->addr)
+ && (cas->expdHi ? isIRAtom(cas->expdHi) : True)
+ && isIRAtom(cas->expdLo)
+ && (cas->dataHi ? isIRAtom(cas->dataHi) : True)
+ && isIRAtom(cas->dataLo) );
+ case Ist_LLSC:
+ return toBool( isIRAtom(st->Ist.LLSC.addr)
+ && (st->Ist.LLSC.storedata
+ ? isIRAtom(st->Ist.LLSC.storedata) : True) );
+ case Ist_Dirty:
+ di = st->Ist.Dirty.details;
+ if (!isIRAtom(di->guard))
+ return False;
+ for (i = 0; di->args[i]; i++)
+ if (!isIRAtom(di->args[i]))
+ return False;
+ if (di->mAddr && !isIRAtom(di->mAddr))
+ return False;
+ return True;
+ case Ist_NoOp:
+ case Ist_IMark:
+ case Ist_MBE:
+ return True;
+ case Ist_Exit:
+ return isIRAtom(st->Ist.Exit.guard);
+ default:
+ vpanic("isFlatIRStmt(st)");
+ }
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Sanity checking ---*/
+/*---------------------------------------------------------------*/
+
+/* Checks:
+
+ Everything is type-consistent. No ill-typed anything.
+ The target address at the end of the BB is a 32- or 64-
+ bit expression, depending on the guest's word size.
+
+ Each temp is assigned only once, before its uses.
+*/
+
+static inline Int countArgs ( IRExpr** args )
+{
+ Int i;
+ for (i = 0; args[i]; i++)
+ ;
+ return i;
+}
+
+static
+__attribute__((noreturn))
+void sanityCheckFail ( IRSB* bb, IRStmt* stmt, HChar* what )
+{
+ vex_printf("\nIR SANITY CHECK FAILURE\n\n");
+ ppIRSB(bb);
+ if (stmt) {
+ vex_printf("\nIN STATEMENT:\n\n");
+ ppIRStmt(stmt);
+ }
+ vex_printf("\n\nERROR = %s\n\n", what );
+ vpanic("sanityCheckFail: exiting due to bad IR");
+}
+
+static Bool saneIRRegArray ( IRRegArray* arr )
+{
+ if (arr->base < 0 || arr->base > 10000 /* somewhat arbitrary */)
+ return False;
+ if (arr->elemTy == Ity_I1)
+ return False;
+ if (arr->nElems <= 0 || arr->nElems > 500 /* somewhat arbitrary */)
+ return False;
+ return True;
+}
+
+static Bool saneIRCallee ( IRCallee* cee )
+{
+ if (cee->name == NULL)
+ return False;
+ if (cee->addr == 0)
+ return False;
+ if (cee->regparms < 0 || cee->regparms > 3)
+ return False;
+ return True;
+}
+
+static Bool saneIRConst ( IRConst* con )
+{
+ switch (con->tag) {
+ case Ico_U1:
+ return toBool( con->Ico.U1 == True || con->Ico.U1 == False );
+ default:
+ /* Is there anything we can meaningfully check? I don't
+ think so. */
+ return True;
+ }
+}
+
+/* Traverse a Stmt/Expr, inspecting IRTemp uses. Report any out of
+ range ones. Report any which are read and for which the current
+ def_count is zero. */
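+
+/* Example (illustrative): in a block containing
+      t1 = Add32(t0,t0)
+   with no prior definition of t0, the RdTmp of t0 is reported,
+   since def_counts[t0] is still zero when the statement is
+   scanned. */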
+
+static
+void useBeforeDef_Temp ( IRSB* bb, IRStmt* stmt, IRTemp tmp, Int* def_counts )
+{
+ if (tmp < 0 || tmp >= bb->tyenv->types_used)
+ sanityCheckFail(bb,stmt, "out of range Temp in IRExpr");
+ if (def_counts[tmp] < 1)
+ sanityCheckFail(bb,stmt, "IRTemp use before def in IRExpr");
+}
+
+static
+void useBeforeDef_Expr ( IRSB* bb, IRStmt* stmt, IRExpr* expr, Int* def_counts )
+{
+ Int i;
+ switch (expr->tag) {
+ case Iex_Get:
+ break;
+ case Iex_GetI:
+ useBeforeDef_Expr(bb,stmt,expr->Iex.GetI.ix,def_counts);
+ break;
+ case Iex_RdTmp:
+ useBeforeDef_Temp(bb,stmt,expr->Iex.RdTmp.tmp,def_counts);
+ break;
+ case Iex_Qop:
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Qop.arg1,def_counts);
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Qop.arg2,def_counts);
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Qop.arg3,def_counts);
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Qop.arg4,def_counts);
+ break;
+ case Iex_Triop:
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Triop.arg1,def_counts);
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Triop.arg2,def_counts);
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Triop.arg3,def_counts);
+ break;
+ case Iex_Binop:
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Binop.arg1,def_counts);
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Binop.arg2,def_counts);
+ break;
+ case Iex_Unop:
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Unop.arg,def_counts);
+ break;
+ case Iex_Load:
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Load.addr,def_counts);
+ break;
+ case Iex_Const:
+ break;
+ case Iex_CCall:
+ for (i = 0; expr->Iex.CCall.args[i]; i++)
+ useBeforeDef_Expr(bb,stmt,expr->Iex.CCall.args[i],def_counts);
+ break;
+ case Iex_Mux0X:
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Mux0X.cond,def_counts);
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Mux0X.expr0,def_counts);
+ useBeforeDef_Expr(bb,stmt,expr->Iex.Mux0X.exprX,def_counts);
+ break;
+ default:
+ vpanic("useBeforeDef_Expr");
+ }
+}
+
+static
+void useBeforeDef_Stmt ( IRSB* bb, IRStmt* stmt, Int* def_counts )
+{
+ Int i;
+ IRDirty* d;
+ IRCAS* cas;
+ switch (stmt->tag) {
+ case Ist_IMark:
+ break;
+ case Ist_AbiHint:
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.AbiHint.base,def_counts);
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.AbiHint.nia,def_counts);
+ break;
+ case Ist_Put:
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.Put.data,def_counts);
+ break;
+ case Ist_PutI:
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.PutI.ix,def_counts);
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.PutI.data,def_counts);
+ break;
+ case Ist_WrTmp:
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.WrTmp.data,def_counts);
+ break;
+ case Ist_Store:
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.Store.addr,def_counts);
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.Store.data,def_counts);
+ break;
+ case Ist_CAS:
+ cas = stmt->Ist.CAS.details;
+ useBeforeDef_Expr(bb,stmt,cas->addr,def_counts);
+ if (cas->expdHi)
+ useBeforeDef_Expr(bb,stmt,cas->expdHi,def_counts);
+ useBeforeDef_Expr(bb,stmt,cas->expdLo,def_counts);
+ if (cas->dataHi)
+ useBeforeDef_Expr(bb,stmt,cas->dataHi,def_counts);
+ useBeforeDef_Expr(bb,stmt,cas->dataLo,def_counts);
+ break;
+ case Ist_LLSC:
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.LLSC.addr,def_counts);
+ if (stmt->Ist.LLSC.storedata != NULL)
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.LLSC.storedata,def_counts);
+ break;
+ case Ist_Dirty:
+ d = stmt->Ist.Dirty.details;
+ for (i = 0; d->args[i] != NULL; i++)
+ useBeforeDef_Expr(bb,stmt,d->args[i],def_counts);
+ if (d->mFx != Ifx_None)
+ useBeforeDef_Expr(bb,stmt,d->mAddr,def_counts);
+ break;
+ case Ist_NoOp:
+ case Ist_MBE:
+ break;
+ case Ist_Exit:
+ useBeforeDef_Expr(bb,stmt,stmt->Ist.Exit.guard,def_counts);
+ break;
+ default:
+ vpanic("useBeforeDef_Stmt");
+ }
+}
+
+static
+void tcExpr ( IRSB* bb, IRStmt* stmt, IRExpr* expr, IRType gWordTy )
+{
+ Int i;
+ IRType t_dst, t_arg1, t_arg2, t_arg3, t_arg4;
+ IRTypeEnv* tyenv = bb->tyenv;
+ switch (expr->tag) {
+ case Iex_Get:
+ case Iex_RdTmp:
+ break;
+ case Iex_GetI:
+ tcExpr(bb,stmt, expr->Iex.GetI.ix, gWordTy );
+ if (typeOfIRExpr(tyenv,expr->Iex.GetI.ix) != Ity_I32)
+ sanityCheckFail(bb,stmt,"IRExpr.GetI.ix: not :: Ity_I32");
+ if (!saneIRRegArray(expr->Iex.GetI.descr))
+ sanityCheckFail(bb,stmt,"IRExpr.GetI.descr: invalid descr");
+ break;
+ case Iex_Qop: {
+ IRType ttarg1, ttarg2, ttarg3, ttarg4;
+ tcExpr(bb,stmt, expr->Iex.Qop.arg1, gWordTy );
+ tcExpr(bb,stmt, expr->Iex.Qop.arg2, gWordTy );
+ tcExpr(bb,stmt, expr->Iex.Qop.arg3, gWordTy );
+ tcExpr(bb,stmt, expr->Iex.Qop.arg4, gWordTy );
+ typeOfPrimop(expr->Iex.Qop.op,
+ &t_dst, &t_arg1, &t_arg2, &t_arg3, &t_arg4);
+ if (t_arg1 == Ity_INVALID || t_arg2 == Ity_INVALID
+ || t_arg3 == Ity_INVALID || t_arg4 == Ity_INVALID) {
+ vex_printf(" op name: " );
+ ppIROp(expr->Iex.Qop.op);
+ vex_printf("\n");
+ sanityCheckFail(bb,stmt,
+ "Iex.Qop: wrong arity op\n"
+ "... name of op precedes BB printout\n");
+ }
+ ttarg1 = typeOfIRExpr(tyenv, expr->Iex.Qop.arg1);
+ ttarg2 = typeOfIRExpr(tyenv, expr->Iex.Qop.arg2);
+ ttarg3 = typeOfIRExpr(tyenv, expr->Iex.Qop.arg3);
+ ttarg4 = typeOfIRExpr(tyenv, expr->Iex.Qop.arg4);
+ if (t_arg1 != ttarg1 || t_arg2 != ttarg2
+ || t_arg3 != ttarg3 || t_arg4 != ttarg4) {
+ vex_printf(" op name: ");
+ ppIROp(expr->Iex.Qop.op);
+ vex_printf("\n");
+ vex_printf(" op type is (");
+ ppIRType(t_arg1);
+ vex_printf(",");
+ ppIRType(t_arg2);
+ vex_printf(",");
+ ppIRType(t_arg3);
+ vex_printf(",");
+ ppIRType(t_arg4);
+ vex_printf(") -> ");
+ ppIRType (t_dst);
+ vex_printf("\narg tys are (");
+ ppIRType(ttarg1);
+ vex_printf(",");
+ ppIRType(ttarg2);
+ vex_printf(",");
+ ppIRType(ttarg3);
+ vex_printf(",");
+ ppIRType(ttarg4);
+ vex_printf(")\n");
+ sanityCheckFail(bb,stmt,
+ "Iex.Qop: arg tys don't match op tys\n"
+ "... additional details precede BB printout\n");
+ }
+ break;
+ }
+ case Iex_Triop: {
+ IRType ttarg1, ttarg2, ttarg3;
+ tcExpr(bb,stmt, expr->Iex.Triop.arg1, gWordTy );
+ tcExpr(bb,stmt, expr->Iex.Triop.arg2, gWordTy );
+ tcExpr(bb,stmt, expr->Iex.Triop.arg3, gWordTy );
+ typeOfPrimop(expr->Iex.Triop.op,
+ &t_dst, &t_arg1, &t_arg2, &t_arg3, &t_arg4);
+ if (t_arg1 == Ity_INVALID || t_arg2 == Ity_INVALID
+ || t_arg3 == Ity_INVALID || t_arg4 != Ity_INVALID) {
+ vex_printf(" op name: " );
+ ppIROp(expr->Iex.Triop.op);
+ vex_printf("\n");
+ sanityCheckFail(bb,stmt,
+ "Iex.Triop: wrong arity op\n"
+ "... name of op precedes BB printout\n");
+ }
+ ttarg1 = typeOfIRExpr(tyenv, expr->Iex.Triop.arg1);
+ ttarg2 = typeOfIRExpr(tyenv, expr->Iex.Triop.arg2);
+ ttarg3 = typeOfIRExpr(tyenv, expr->Iex.Triop.arg3);
+ if (t_arg1 != ttarg1 || t_arg2 != ttarg2 || t_arg3 != ttarg3) {
+ vex_printf(" op name: ");
+ ppIROp(expr->Iex.Triop.op);
+ vex_printf("\n");
+ vex_printf(" op type is (");
+ ppIRType(t_arg1);
+ vex_printf(",");
+ ppIRType(t_arg2);
+ vex_printf(",");
+ ppIRType(t_arg3);
+ vex_printf(") -> ");
+ ppIRType (t_dst);
+ vex_printf("\narg tys are (");
+ ppIRType(ttarg1);
+ vex_printf(",");
+ ppIRType(ttarg2);
+ vex_printf(",");
+ ppIRType(ttarg3);
+ vex_printf(")\n");
+ sanityCheckFail(bb,stmt,
+ "Iex.Triop: arg tys don't match op tys\n"
+ "... additional details precede BB printout\n");
+ }
+ break;
+ }
+ case Iex_Binop: {
+ IRType ttarg1, ttarg2;
+ tcExpr(bb,stmt, expr->Iex.Binop.arg1, gWordTy );
+ tcExpr(bb,stmt, expr->Iex.Binop.arg2, gWordTy );
+ typeOfPrimop(expr->Iex.Binop.op,
+ &t_dst, &t_arg1, &t_arg2, &t_arg3, &t_arg4);
+ if (t_arg1 == Ity_INVALID || t_arg2 == Ity_INVALID
+ || t_arg3 != Ity_INVALID || t_arg4 != Ity_INVALID) {
+ vex_printf(" op name: " );
+ ppIROp(expr->Iex.Binop.op);
+ vex_printf("\n");
+ sanityCheckFail(bb,stmt,
+ "Iex.Binop: wrong arity op\n"
+ "... name of op precedes BB printout\n");
+ }
+ ttarg1 = typeOfIRExpr(tyenv, expr->Iex.Binop.arg1);
+ ttarg2 = typeOfIRExpr(tyenv, expr->Iex.Binop.arg2);
+ if (t_arg1 != ttarg1 || t_arg2 != ttarg2) {
+ vex_printf(" op name: ");
+ ppIROp(expr->Iex.Binop.op);
+ vex_printf("\n");
+ vex_printf(" op type is (");
+ ppIRType(t_arg1);
+ vex_printf(",");
+ ppIRType(t_arg2);
+ vex_printf(") -> ");
+ ppIRType (t_dst);
+ vex_printf("\narg tys are (");
+ ppIRType(ttarg1);
+ vex_printf(",");
+ ppIRType(ttarg2);
+ vex_printf(")\n");
+ sanityCheckFail(bb,stmt,
+ "Iex.Binop: arg tys don't match op tys\n"
+ "... additional details precede BB printout\n");
+ }
+ break;
+ }
+ case Iex_Unop:
+ tcExpr(bb,stmt, expr->Iex.Unop.arg, gWordTy );
+         typeOfPrimop(expr->Iex.Unop.op,
+                      &t_dst, &t_arg1, &t_arg2, &t_arg3, &t_arg4);
+ if (t_arg1 == Ity_INVALID || t_arg2 != Ity_INVALID
+ || t_arg3 != Ity_INVALID || t_arg4 != Ity_INVALID)
+ sanityCheckFail(bb,stmt,"Iex.Unop: wrong arity op");
+ if (t_arg1 != typeOfIRExpr(tyenv, expr->Iex.Unop.arg))
+ sanityCheckFail(bb,stmt,"Iex.Unop: arg ty doesn't match op ty");
+ break;
+ case Iex_Load:
+ tcExpr(bb,stmt, expr->Iex.Load.addr, gWordTy);
+ if (typeOfIRExpr(tyenv, expr->Iex.Load.addr) != gWordTy)
+ sanityCheckFail(bb,stmt,"Iex.Load.addr: not :: guest word type");
+ if (expr->Iex.Load.end != Iend_LE && expr->Iex.Load.end != Iend_BE)
+ sanityCheckFail(bb,stmt,"Iex.Load.end: bogus endianness");
+ break;
+ case Iex_CCall:
+ if (!saneIRCallee(expr->Iex.CCall.cee))
+ sanityCheckFail(bb,stmt,"Iex.CCall.cee: bad IRCallee");
+ if (expr->Iex.CCall.cee->regparms > countArgs(expr->Iex.CCall.args))
+ sanityCheckFail(bb,stmt,"Iex.CCall.cee: #regparms > #args");
+ for (i = 0; expr->Iex.CCall.args[i]; i++) {
+ if (i >= 32)
+ sanityCheckFail(bb,stmt,"Iex.CCall: > 32 args");
+ tcExpr(bb,stmt, expr->Iex.CCall.args[i], gWordTy);
+ }
+ if (expr->Iex.CCall.retty == Ity_I1)
+ sanityCheckFail(bb,stmt,"Iex.CCall.retty: cannot return :: Ity_I1");
+ for (i = 0; expr->Iex.CCall.args[i]; i++)
+ if (typeOfIRExpr(tyenv, expr->Iex.CCall.args[i]) == Ity_I1)
+ sanityCheckFail(bb,stmt,"Iex.CCall.arg: arg :: Ity_I1");
+ break;
+ case Iex_Const:
+ if (!saneIRConst(expr->Iex.Const.con))
+ sanityCheckFail(bb,stmt,"Iex.Const.con: invalid const");
+ break;
+ case Iex_Mux0X:
+ tcExpr(bb,stmt, expr->Iex.Mux0X.cond, gWordTy);
+ tcExpr(bb,stmt, expr->Iex.Mux0X.expr0, gWordTy);
+ tcExpr(bb,stmt, expr->Iex.Mux0X.exprX, gWordTy);
+ if (typeOfIRExpr(tyenv, expr->Iex.Mux0X.cond) != Ity_I8)
+ sanityCheckFail(bb,stmt,"Iex.Mux0X.cond: cond :: Ity_I8");
+ if (typeOfIRExpr(tyenv, expr->Iex.Mux0X.expr0)
+ != typeOfIRExpr(tyenv, expr->Iex.Mux0X.exprX))
+ sanityCheckFail(bb,stmt,"Iex.Mux0X: expr0/exprX mismatch");
+ break;
+ default:
+ vpanic("tcExpr");
+ }
+}
+
+
+static
+void tcStmt ( IRSB* bb, IRStmt* stmt, IRType gWordTy )
+{
+ Int i;
+ IRDirty* d;
+ IRCAS* cas;
+ IRType tyExpd, tyData;
+ IRTypeEnv* tyenv = bb->tyenv;
+ switch (stmt->tag) {
+ case Ist_IMark:
+ /* Somewhat heuristic, but rule out totally implausible
+ instruction sizes. */
+ if (stmt->Ist.IMark.len < 0 || stmt->Ist.IMark.len > 20)
+ sanityCheckFail(bb,stmt,"IRStmt.IMark.len: implausible");
+ break;
+ case Ist_AbiHint:
+ if (typeOfIRExpr(tyenv, stmt->Ist.AbiHint.base) != gWordTy)
+ sanityCheckFail(bb,stmt,"IRStmt.AbiHint.base: "
+ "not :: guest word type");
+ if (typeOfIRExpr(tyenv, stmt->Ist.AbiHint.nia) != gWordTy)
+ sanityCheckFail(bb,stmt,"IRStmt.AbiHint.nia: "
+ "not :: guest word type");
+ break;
+ case Ist_Put:
+ tcExpr( bb, stmt, stmt->Ist.Put.data, gWordTy );
+ if (typeOfIRExpr(tyenv,stmt->Ist.Put.data) == Ity_I1)
+ sanityCheckFail(bb,stmt,"IRStmt.Put.data: cannot Put :: Ity_I1");
+ break;
+ case Ist_PutI:
+ tcExpr( bb, stmt, stmt->Ist.PutI.data, gWordTy );
+ tcExpr( bb, stmt, stmt->Ist.PutI.ix, gWordTy );
+ if (typeOfIRExpr(tyenv,stmt->Ist.PutI.data) == Ity_I1)
+ sanityCheckFail(bb,stmt,"IRStmt.PutI.data: cannot PutI :: Ity_I1");
+ if (typeOfIRExpr(tyenv,stmt->Ist.PutI.data)
+ != stmt->Ist.PutI.descr->elemTy)
+ sanityCheckFail(bb,stmt,"IRStmt.PutI.data: data ty != elem ty");
+ if (typeOfIRExpr(tyenv,stmt->Ist.PutI.ix) != Ity_I32)
+ sanityCheckFail(bb,stmt,"IRStmt.PutI.ix: not :: Ity_I32");
+ if (!saneIRRegArray(stmt->Ist.PutI.descr))
+ sanityCheckFail(bb,stmt,"IRStmt.PutI.descr: invalid descr");
+ break;
+ case Ist_WrTmp:
+ tcExpr( bb, stmt, stmt->Ist.WrTmp.data, gWordTy );
+ if (typeOfIRTemp(tyenv, stmt->Ist.WrTmp.tmp)
+ != typeOfIRExpr(tyenv, stmt->Ist.WrTmp.data))
+ sanityCheckFail(bb,stmt,"IRStmt.Put.Tmp: tmp and expr do not match");
+ break;
+ case Ist_Store:
+ tcExpr( bb, stmt, stmt->Ist.Store.addr, gWordTy );
+ tcExpr( bb, stmt, stmt->Ist.Store.data, gWordTy );
+ if (typeOfIRExpr(tyenv, stmt->Ist.Store.addr) != gWordTy)
+ sanityCheckFail(bb,stmt,"IRStmt.Store.addr: not :: guest word type");
+ if (typeOfIRExpr(tyenv, stmt->Ist.Store.data) == Ity_I1)
+ sanityCheckFail(bb,stmt,"IRStmt.Store.data: cannot Store :: Ity_I1");
+ if (stmt->Ist.Store.end != Iend_LE && stmt->Ist.Store.end != Iend_BE)
+ sanityCheckFail(bb,stmt,"Ist.Store.end: bogus endianness");
+ break;
+ case Ist_CAS:
+ cas = stmt->Ist.CAS.details;
+ /* make sure it's definitely either a CAS or a DCAS */
+ if (cas->oldHi == IRTemp_INVALID
+ && cas->expdHi == NULL && cas->dataHi == NULL) {
+ /* fine; it's a single cas */
+ }
+ else
+ if (cas->oldHi != IRTemp_INVALID
+ && cas->expdHi != NULL && cas->dataHi != NULL) {
+ /* fine; it's a double cas */
+ }
+ else {
+ /* it's some el-mutanto hybrid */
+ goto bad_cas;
+ }
+ /* check the address type */
+ tcExpr( bb, stmt, cas->addr, gWordTy );
+ if (typeOfIRExpr(tyenv, cas->addr) != gWordTy) goto bad_cas;
+ /* check types on the {old,expd,data}Lo components agree */
+ tyExpd = typeOfIRExpr(tyenv, cas->expdLo);
+ tyData = typeOfIRExpr(tyenv, cas->dataLo);
+ if (tyExpd != tyData) goto bad_cas;
+ if (tyExpd != typeOfIRTemp(tyenv, cas->oldLo))
+ goto bad_cas;
+ /* check the base element type is sane */
+ if (tyExpd == Ity_I8 || tyExpd == Ity_I16 || tyExpd == Ity_I32
+ || (gWordTy == Ity_I64 && tyExpd == Ity_I64)) {
+ /* fine */
+ } else {
+ goto bad_cas;
+ }
+ /* If it's a DCAS, check types on the {old,expd,data}Hi
+ components too */
+ if (cas->oldHi != IRTemp_INVALID) {
+ tyExpd = typeOfIRExpr(tyenv, cas->expdHi);
+ tyData = typeOfIRExpr(tyenv, cas->dataHi);
+ if (tyExpd != tyData) goto bad_cas;
+ if (tyExpd != typeOfIRTemp(tyenv, cas->oldHi))
+ goto bad_cas;
+ /* and finally check that oldLo and oldHi have the same
+ type. This forces equivalence amongst all 6 types. */
+ if (typeOfIRTemp(tyenv, cas->oldHi)
+ != typeOfIRTemp(tyenv, cas->oldLo))
+ goto bad_cas;
+ }
+ break;
+ bad_cas:
+ sanityCheckFail(bb,stmt,"IRStmt.CAS: ill-formed");
+ break;
+ case Ist_LLSC: {
+ IRType tyRes;
+ if (typeOfIRExpr(tyenv, stmt->Ist.LLSC.addr) != gWordTy)
+ sanityCheckFail(bb,stmt,"IRStmt.LLSC.addr: not :: guest word type");
+ if (stmt->Ist.LLSC.end != Iend_LE && stmt->Ist.LLSC.end != Iend_BE)
+ sanityCheckFail(bb,stmt,"Ist.LLSC.end: bogus endianness");
+ tyRes = typeOfIRTemp(tyenv, stmt->Ist.LLSC.result);
+ if (stmt->Ist.LLSC.storedata == NULL) {
+ /* it's a LL */
+ if (tyRes != Ity_I64 && tyRes != Ity_I32 && tyRes != Ity_I8)
+ sanityCheckFail(bb,stmt,"Ist.LLSC(LL).result :: bogus");
+ } else {
+ /* it's a SC */
+ if (tyRes != Ity_I1)
+ sanityCheckFail(bb,stmt,"Ist.LLSC(SC).result: not :: Ity_I1");
+ tyData = typeOfIRExpr(tyenv, stmt->Ist.LLSC.storedata);
+ if (tyData != Ity_I64 && tyData != Ity_I32 && tyData != Ity_I8)
+ sanityCheckFail(bb,stmt,
+ "Ist.LLSC(SC).result :: storedata bogus");
+ }
+ break;
+ }
+ case Ist_Dirty:
+ /* Mostly check for various kinds of ill-formed dirty calls. */
+ d = stmt->Ist.Dirty.details;
+ if (d->cee == NULL) goto bad_dirty;
+ if (!saneIRCallee(d->cee)) goto bad_dirty;
+ if (d->cee->regparms > countArgs(d->args)) goto bad_dirty;
+ if (d->mFx == Ifx_None) {
+ if (d->mAddr != NULL || d->mSize != 0)
+ goto bad_dirty;
+ } else {
+ if (d->mAddr == NULL || d->mSize == 0)
+ goto bad_dirty;
+ }
+ if (d->nFxState < 0 || d->nFxState > VEX_N_FXSTATE)
+ goto bad_dirty;
+ if (d->nFxState == 0 && d->needsBBP)
+ goto bad_dirty;
+ for (i = 0; i < d->nFxState; i++) {
+ if (d->fxState[i].fx == Ifx_None) goto bad_dirty;
+ if (d->fxState[i].size <= 0) goto bad_dirty;
+ }
+ /* check types, minimally */
+ if (d->guard == NULL) goto bad_dirty;
+ tcExpr( bb, stmt, d->guard, gWordTy );
+ if (typeOfIRExpr(tyenv, d->guard) != Ity_I1)
+ sanityCheckFail(bb,stmt,"IRStmt.Dirty.guard not :: Ity_I1");
+ if (d->tmp != IRTemp_INVALID
+ && typeOfIRTemp(tyenv, d->tmp) == Ity_I1)
+ sanityCheckFail(bb,stmt,"IRStmt.Dirty.dst :: Ity_I1");
+ for (i = 0; d->args[i] != NULL; i++) {
+ if (i >= 32)
+ sanityCheckFail(bb,stmt,"IRStmt.Dirty: > 32 args");
+ if (typeOfIRExpr(tyenv, d->args[i]) == Ity_I1)
+ sanityCheckFail(bb,stmt,"IRStmt.Dirty.arg[i] :: Ity_I1");
+ }
+ break;
+ bad_dirty:
+ sanityCheckFail(bb,stmt,"IRStmt.Dirty: ill-formed");
+ break;
+ case Ist_NoOp:
+ break;
+ case Ist_MBE:
+ switch (stmt->Ist.MBE.event) {
+ case Imbe_Fence:
+ break;
+ default: sanityCheckFail(bb,stmt,"IRStmt.MBE.event: unknown");
+ break;
+ }
+ break;
+ case Ist_Exit:
+ tcExpr( bb, stmt, stmt->Ist.Exit.guard, gWordTy );
+ if (typeOfIRExpr(tyenv,stmt->Ist.Exit.guard) != Ity_I1)
+ sanityCheckFail(bb,stmt,"IRStmt.Exit.guard: not :: Ity_I1");
+ if (!saneIRConst(stmt->Ist.Exit.dst))
+ sanityCheckFail(bb,stmt,"IRStmt.Exit.dst: bad dst");
+ if (typeOfIRConst(stmt->Ist.Exit.dst) != gWordTy)
+ sanityCheckFail(bb,stmt,"IRStmt.Exit.dst: not :: guest word type");
+ break;
+ default:
+ vpanic("tcStmt");
+ }
+}
+
+void sanityCheckIRSB ( IRSB* bb, HChar* caller,
+ Bool require_flat, IRType guest_word_size )
+{
+ Int i;
+ IRStmt* stmt;
+ Int n_temps = bb->tyenv->types_used;
+ Int* def_counts = LibVEX_Alloc(n_temps * sizeof(Int));
+
+ if (0)
+ vex_printf("sanityCheck: %s\n", caller);
+
+ vassert(guest_word_size == Ity_I32
+ || guest_word_size == Ity_I64);
+
+ if (bb->stmts_used < 0 || bb->stmts_size < 8
+ || bb->stmts_used > bb->stmts_size)
+ /* this BB is so strange we can't even print it */
+ vpanic("sanityCheckIRSB: stmts array limits wierd");
+
+ /* Ensure each temp has a plausible type. */
+ for (i = 0; i < n_temps; i++) {
+ IRType ty = typeOfIRTemp(bb->tyenv,(IRTemp)i);
+ if (!isPlausibleIRType(ty)) {
+ vex_printf("Temp t%d declared with implausible type 0x%x\n",
+ i, (UInt)ty);
+ sanityCheckFail(bb,NULL,"Temp declared with implausible type");
+ }
+ }
+
+ /* Check for flatness, if required. */
+ if (require_flat) {
+ for (i = 0; i < bb->stmts_used; i++) {
+ stmt = bb->stmts[i];
+ if (!stmt)
+ sanityCheckFail(bb, stmt, "IRStmt: is NULL");
+ if (!isFlatIRStmt(stmt))
+ sanityCheckFail(bb, stmt, "IRStmt: is not flat");
+ }
+ if (!isIRAtom(bb->next))
+ sanityCheckFail(bb, NULL, "bb->next is not an atom");
+ }
+
+ /* Count the defs of each temp. Only one def is allowed.
+ Also, check that each used temp has already been defd. */
+
+ for (i = 0; i < n_temps; i++)
+ def_counts[i] = 0;
+
+ for (i = 0; i < bb->stmts_used; i++) {
+ IRDirty* d;
+ IRCAS* cas;
+ stmt = bb->stmts[i];
+ /* Check any temps used by this statement. */
+ useBeforeDef_Stmt(bb,stmt,def_counts);
+
+ /* Now make note of any temps defd by this statement. */
+ switch (stmt->tag) {
+ case Ist_WrTmp:
+ if (stmt->Ist.WrTmp.tmp < 0 || stmt->Ist.WrTmp.tmp >= n_temps)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.Tmp: destination tmp is out of range");
+ def_counts[stmt->Ist.WrTmp.tmp]++;
+ if (def_counts[stmt->Ist.WrTmp.tmp] > 1)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.Tmp: destination tmp is assigned more than once");
+ break;
+ case Ist_Store:
+ break;
+ case Ist_Dirty:
+ if (stmt->Ist.Dirty.details->tmp != IRTemp_INVALID) {
+ d = stmt->Ist.Dirty.details;
+ if (d->tmp < 0 || d->tmp >= n_temps)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.Dirty: destination tmp is out of range");
+ def_counts[d->tmp]++;
+ if (def_counts[d->tmp] > 1)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.Dirty: destination tmp is assigned more than once");
+ }
+ break;
+ case Ist_CAS:
+ cas = stmt->Ist.CAS.details;
+ if (cas->oldHi != IRTemp_INVALID) {
+ if (cas->oldHi < 0 || cas->oldHi >= n_temps)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.CAS: destination tmpHi is out of range");
+ def_counts[cas->oldHi]++;
+ if (def_counts[cas->oldHi] > 1)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.CAS: destination tmpHi is assigned more than once");
+ }
+ if (cas->oldLo < 0 || cas->oldLo >= n_temps)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.CAS: destination tmpLo is out of range");
+ def_counts[cas->oldLo]++;
+ if (def_counts[cas->oldLo] > 1)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.CAS: destination tmpLo is assigned more than once");
+ break;
+ case Ist_LLSC:
+ if (stmt->Ist.LLSC.result < 0 || stmt->Ist.LLSC.result >= n_temps)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.LLSC: destination tmp is out of range");
+ def_counts[stmt->Ist.LLSC.result]++;
+ if (def_counts[stmt->Ist.LLSC.result] > 1)
+ sanityCheckFail(bb, stmt,
+ "IRStmt.LLSC: destination tmp is assigned more than once");
+ break;
+ default:
+ /* explicitly handle the rest, so as to keep gcc quiet */
+ break;
+ }
+ }
+
+ /* Typecheck everything. */
+ for (i = 0; i < bb->stmts_used; i++)
+ if (bb->stmts[i])
+ tcStmt( bb, bb->stmts[i], guest_word_size );
+ if (typeOfIRExpr(bb->tyenv,bb->next) != guest_word_size)
+ sanityCheckFail(bb, NULL, "bb->next field has wrong type");
+}
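+
+/* Usage sketch (illustrative): a pass would typically re-check a
+   block after transforming it, e.g.
+
+      sanityCheckIRSB(bb, "after-instrumentation", True, Ity_I64);
+
+   which requires flatness and panics with a printout of bb if
+   anything is ill-typed. */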
+
+/*---------------------------------------------------------------*/
+/*--- Misc helper functions ---*/
+/*---------------------------------------------------------------*/
+
+Bool eqIRConst ( IRConst* c1, IRConst* c2 )
+{
+ if (c1->tag != c2->tag)
+ return False;
+
+ switch (c1->tag) {
+ case Ico_U1: return toBool( (1 & c1->Ico.U1) == (1 & c2->Ico.U1) );
+ case Ico_U8: return toBool( c1->Ico.U8 == c2->Ico.U8 );
+ case Ico_U16: return toBool( c1->Ico.U16 == c2->Ico.U16 );
+ case Ico_U32: return toBool( c1->Ico.U32 == c2->Ico.U32 );
+ case Ico_U64: return toBool( c1->Ico.U64 == c2->Ico.U64 );
+ case Ico_F64: return toBool( c1->Ico.F64 == c2->Ico.F64 );
+ case Ico_F64i: return toBool( c1->Ico.F64i == c2->Ico.F64i );
+ case Ico_V128: return toBool( c1->Ico.V128 == c2->Ico.V128 );
+ default: vpanic("eqIRConst");
+ }
+}
+
+Bool eqIRRegArray ( IRRegArray* descr1, IRRegArray* descr2 )
+{
+ return toBool( descr1->base == descr2->base
+ && descr1->elemTy == descr2->elemTy
+ && descr1->nElems == descr2->nElems );
+}
+
+Int sizeofIRType ( IRType ty )
+{
+ switch (ty) {
+ case Ity_I8: return 1;
+ case Ity_I16: return 2;
+ case Ity_I32: return 4;
+ case Ity_I64: return 8;
+ case Ity_I128: return 16;
+ case Ity_F32: return 4;
+ case Ity_F64: return 8;
+ case Ity_V128: return 16;
+ default: vex_printf("\n"); ppIRType(ty); vex_printf("\n");
+ vpanic("sizeofIRType");
+ }
+}
+
+IRExpr* mkIRExpr_HWord ( HWord hw )
+{
+ vassert(sizeof(void*) == sizeof(HWord));
+ if (sizeof(HWord) == 4)
+ return IRExpr_Const(IRConst_U32((UInt)hw));
+ if (sizeof(HWord) == 8)
+ return IRExpr_Const(IRConst_U64((ULong)hw));
+ vpanic("mkIRExpr_HWord");
+}
+
+IRDirty* unsafeIRDirty_0_N ( Int regparms, HChar* name, void* addr,
+ IRExpr** args )
+{
+ IRDirty* d = emptyIRDirty();
+ d->cee = mkIRCallee ( regparms, name, addr );
+ d->guard = IRExpr_Const(IRConst_U1(True));
+ d->args = args;
+ return d;
+}
+
+IRDirty* unsafeIRDirty_1_N ( IRTemp dst,
+ Int regparms, HChar* name, void* addr,
+ IRExpr** args )
+{
+ IRDirty* d = emptyIRDirty();
+ d->cee = mkIRCallee ( regparms, name, addr );
+ d->guard = IRExpr_Const(IRConst_U1(True));
+ d->args = args;
+ d->tmp = dst;
+ return d;
+}
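+
+/* Usage sketch (illustrative; "helper" is a hypothetical C
+   function):
+
+      IRDirty* d = unsafeIRDirty_1_N( dst, 0, "helper",
+                                      (void*)helper,
+                                      mkIRExprVec_0() );
+      addStmtToIRSB(bb, IRStmt_Dirty(d));
+
+   calls helper with no arguments and writes its result to temp
+   dst. */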
+
+IRExpr* mkIRExprCCall ( IRType retty,
+ Int regparms, HChar* name, void* addr,
+ IRExpr** args )
+{
+ return IRExpr_CCall ( mkIRCallee ( regparms, name, addr ),
+ retty, args );
+}
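+
+/* Usage sketch (illustrative; "pure_helper" is a hypothetical clean
+   helper):
+
+      IRExpr* call
+         = mkIRExprCCall( Ity_I64, 0, "pure_helper",
+                          (void*)pure_helper,
+                          mkIRExprVec_1(IRExpr_RdTmp(t1)) );
+
+   builds a side-effect-free call usable anywhere an IRExpr is. */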
+
+Bool eqIRAtom ( IRExpr* a1, IRExpr* a2 )
+{
+ vassert(isIRAtom(a1));
+ vassert(isIRAtom(a2));
+ if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
+ return toBool(a1->Iex.RdTmp.tmp == a2->Iex.RdTmp.tmp);
+ if (a1->tag == Iex_Const && a2->tag == Iex_Const)
+ return eqIRConst(a1->Iex.Const.con, a2->Iex.Const.con);
+ return False;
+}
+
+/*---------------------------------------------------------------*/
+/*--- end ir_defs.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_match.c b/VEX/priv/ir_match.c
new file mode 100644
index 0000000..fc32f2e
--- /dev/null
+++ b/VEX/priv/ir_match.c
@@ -0,0 +1,111 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin ir_match.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* Provides a facility for doing IR tree matching. */
+
+#include "main_util.h"
+#include "ir_match.h"
+
+
+/* Assign a value to a binder. Checks for obvious stupidities. */
+
+static
+void setBindee ( MatchInfo* mi, Int n, IRExpr* bindee )
+{
+ if (n < 0 || n >= N_IRMATCH_BINDERS)
+ vpanic("setBindee: out of range index");
+ if (mi->bindee[n] != NULL)
+ vpanic("setBindee: bindee already set");
+ mi->bindee[n] = bindee;
+}
+
+
+/* This is the actual matching function, recursing over the pattern
+ and expression trees in the obvious way, and dumping any matches
+ found into 'mi'. */
+
+static
+Bool matchWrk ( MatchInfo* mi, IRExpr* p/*attern*/, IRExpr* e/*xpr*/ )
+{
+ switch (p->tag) {
+ case Iex_Binder: /* aha, what we were looking for. */
+ setBindee(mi, p->Iex.Binder.binder, e);
+ return True;
+ case Iex_Unop:
+ if (e->tag != Iex_Unop) return False;
+ if (p->Iex.Unop.op != e->Iex.Unop.op) return False;
+ if (!matchWrk(mi, p->Iex.Unop.arg, e->Iex.Unop.arg))
+ return False;
+ return True;
+ case Iex_Binop:
+ if (e->tag != Iex_Binop) return False;
+ if (p->Iex.Binop.op != e->Iex.Binop.op) return False;
+ if (!matchWrk(mi, p->Iex.Binop.arg1, e->Iex.Binop.arg1))
+ return False;
+ if (!matchWrk(mi, p->Iex.Binop.arg2, e->Iex.Binop.arg2))
+ return False;
+ return True;
+ case Iex_Load:
+ if (e->tag != Iex_Load) return False;
+ if (p->Iex.Load.end != e->Iex.Load.end) return False;
+ if (p->Iex.Load.ty != e->Iex.Load.ty) return False;
+ if (!matchWrk(mi, p->Iex.Load.addr, e->Iex.Load.addr))
+ return False;
+ return True;
+ case Iex_Const:
+ if (e->tag != Iex_Const) return False;
+ return eqIRConst(p->Iex.Const.con, e->Iex.Const.con);
+ default:
+ ppIRExpr(p);
+ vpanic("match");
+ }
+}
+
+
+/* Top level entry point to the matcher. */
+
+Bool matchIRExpr ( MatchInfo* mi, IRExpr* p/*attern*/, IRExpr* e/*xpr*/ )
+{
+ Int i;
+ for (i = 0; i < N_IRMATCH_BINDERS; i++)
+ mi->bindee[i] = NULL;
+ return matchWrk(mi, p, e);
+}
+
+
+
+/*---------------------------------------------------------------*/
+/*--- end ir_match.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_match.h b/VEX/priv/ir_match.h
new file mode 100644
index 0000000..5755505
--- /dev/null
+++ b/VEX/priv/ir_match.h
@@ -0,0 +1,90 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin ir_match.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+/* Provides a facility for doing IR tree matching. */
+
+#ifndef __VEX_IR_MATCH_H
+#define __VEX_IR_MATCH_H
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+
+
+/* Patterns are simply IRExpr* trees, with IRExpr_Binder nodes at the
+ leaves, indicating binding points. Use these magic macros to
+ declare and define patterns. */
+
+#define DECLARE_PATTERN(_patt) \
+ static IRExpr* _patt = NULL
+
+#define DEFINE_PATTERN(_patt,_expr) \
+ do { \
+ if (!(_patt)) { \
+ vassert(vexGetAllocMode() == VexAllocModeTEMP); \
+ vexSetAllocMode(VexAllocModePERM); \
+ _patt = (_expr); \
+ vexSetAllocMode(VexAllocModeTEMP); \
+ vassert(vexGetAllocMode() == VexAllocModeTEMP); \
+ } \
+ } while (0)
+
+
+/* This type carries the result of a match -- it records what
+   the binders got instantiated to. */
+
+#define N_IRMATCH_BINDERS 4
+
+typedef
+ struct {
+ IRExpr* bindee[N_IRMATCH_BINDERS];
+ }
+ MatchInfo;
+
+
+/* The matching function. p is expected to have zero or more
+   IRExpr_Binder nodes in it, numbered 0, 1, 2 ... Returns True if
+   a match succeeded. */
+
+extern
+Bool matchIRExpr ( MatchInfo* mi, IRExpr* p/*attern*/, IRExpr* e/*xpr*/ );
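+
+/* Usage sketch (illustrative; 'unop' and 'bind' stand for the usual
+   local shorthands for IRExpr_Unop and IRExpr_Binder):
+
+      DECLARE_PATTERN(p_32to1);
+      DEFINE_PATTERN(p_32to1, unop(Iop_32to1, bind(0)));
+      MatchInfo mi;
+      if (matchIRExpr(&mi, p_32to1, e)) {
+         IRExpr* arg = mi.bindee[0];   -- whatever Binder 0 matched
+         ...
+      }
+*/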
+
+
+#endif /* ndef __VEX_IR_MATCH_H */
+
+
+
+/*---------------------------------------------------------------*/
+/*--- end ir_match.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_opt.c b/VEX/priv/ir_opt.c
new file mode 100644
index 0000000..4730680
--- /dev/null
+++ b/VEX/priv/ir_opt.c
@@ -0,0 +1,4671 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin ir_opt.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+#include "main_util.h"
+#include "main_globals.h"
+#include "ir_opt.h"
+
+
+/* Set to 1 for lots of debugging output. */
+#define DEBUG_IROPT 0
+
+
+/* What iropt does, 29 Dec 04.
+
+ It takes an IRSB and produces a new one with the same meaning,
+ defined thus:
+
+ After execution of the new BB, all guest state and guest memory
+ are the same as after execution of the original. This is true
+ regardless of how the block was exited (at the end vs side exit).
+
+ In addition, parts of the guest state will be identical to those
+ created by execution of the original at the following observation
+ points:
+
+ * In a dirty helper call, any parts of the guest state that the
+ helper states that it reads or modifies will be up to date.
+ Also, guest memory will be up to date. Parts of the guest state
+ not marked as being read or modified by the helper cannot be
+ assumed to be up-to-date at the point where the helper is called.
+
+ * Immediately prior to any load or store, those parts of the guest
+ state marked as requiring precise exceptions will be up to date.
+ Also, guest memory will be up to date. Parts of the guest state
+ not marked as requiring precise exceptions cannot be assumed to
+ be up-to-date at the point of the load/store.
+
+ The relative order of loads and stores (including loads/stores of
+ guest memory done by dirty helpers annotated as such) is not
+ changed. However, the relative order of loads with no intervening
+ stores/modifies may be changed.
+
+ Transformation order
+ ~~~~~~~~~~~~~~~~~~~~
+
+ There are three levels of optimisation, controlled by
+ vex_control.iropt_level. Define first:
+
+ "Cheap transformations" are the following sequence:
+ * Redundant-Get removal
+ * Redundant-Put removal
+ * Constant propagation/folding
+ * Dead code removal
+ * Specialisation of clean helper functions
+ * Dead code removal
+
+ "Expensive transformations" are the following sequence:
+ * CSE
+ * Folding of add/sub chains
+ * Redundant-GetI removal
+ * Redundant-PutI removal
+ * Dead code removal
+
+ Then the transformations are as follows, as defined by
+ vex_control.iropt_level:
+
+ Level 0:
+ * Flatten into atomic form.
+
+ Level 1: the following sequence:
+ * Flatten into atomic form.
+ * Cheap transformations.
+
+ Level 2: the following sequence
+ * Flatten into atomic form.
+ * Cheap transformations.
+ * If block contains any floating or vector types, CSE.
+ * If block contains GetI or PutI, Expensive transformations.
+ * Try unrolling loops. Three possible outcomes:
+ - No effect: do nothing more.
+ - Unrolled a loop, and block does not contain GetI or PutI:
+ Do: * CSE
+ * Dead code removal
+ - Unrolled a loop, and block contains GetI or PutI:
+ Do: * Expensive transformations
+ * Cheap transformations
+*/
+
+/* Implementation notes, 29 Dec 04.
+
+ TODO (important): I think rPutI removal ignores precise exceptions
+ and is therefore, in a sense, wrong: PutIs are assumed not to
+ write parts of the guest state that we need to have up-to-date at
+ loads/stores. So far, on the x86 guest, that has not mattered,
+ since only the x87 FP registers and tags are accessed using
+ GetI/PutI, and there has so far been no need for them to be up to
+ date at mem exception points. The rPutI pass should be fixed.
+
+ TODO: improve pessimistic handling of precise exceptions
+ in the tree builder.
+
+ TODO: check interaction of rGetI and dirty helpers.
+
+ F64i constants are treated differently from other constants.
+ They are not regarded as atoms, and are instead lifted off and
+ bound to temps. This allows them to participate in CSE, which
+ is important for getting good performance for x86 guest code.
+
+ TODO: CSE up F64 literals too (F64is are already handled).
+
+ TODO: consider carefully the requirement for precise exns
+ prior to making CSE any more aggressive. */
+
+
+/*---------------------------------------------------------------*/
+/*--- Finite mappery, of a sort ---*/
+/*---------------------------------------------------------------*/
+
+/* General map from HWord-sized thing to HWord-sized thing. Could be
+   done by hashing, but it's not clear whether that would really be
+   any faster. */
+
+typedef
+ struct {
+ Bool* inuse;
+ HWord* key;
+ HWord* val;
+ Int size;
+ Int used;
+ }
+ HashHW;
+
+static HashHW* newHHW ( void )
+{
+ HashHW* h = LibVEX_Alloc(sizeof(HashHW));
+ h->size = 8;
+ h->used = 0;
+ h->inuse = LibVEX_Alloc(h->size * sizeof(Bool));
+ h->key = LibVEX_Alloc(h->size * sizeof(HWord));
+ h->val = LibVEX_Alloc(h->size * sizeof(HWord));
+ return h;
+}
+
+
+/* Look up key in the map. */
+
+static Bool lookupHHW ( HashHW* h, /*OUT*/HWord* val, HWord key )
+{
+ Int i;
+ /* vex_printf("lookupHHW(%llx)\n", key ); */
+ for (i = 0; i < h->used; i++) {
+ if (h->inuse[i] && h->key[i] == key) {
+ if (val)
+ *val = h->val[i];
+ return True;
+ }
+ }
+ return False;
+}
+
+
+/* Add key->val to the map. Replaces any existing binding for key. */
+
+static void addToHHW ( HashHW* h, HWord key, HWord val )
+{
+ Int i, j;
+ /* vex_printf("addToHHW(%llx, %llx)\n", key, val); */
+
+ /* Find and replace existing binding, if any. */
+ for (i = 0; i < h->used; i++) {
+ if (h->inuse[i] && h->key[i] == key) {
+ h->val[i] = val;
+ return;
+ }
+ }
+
+ /* Ensure a space is available. */
+ if (h->used == h->size) {
+ /* Copy into arrays twice the size. */
+ Bool* inuse2 = LibVEX_Alloc(2 * h->size * sizeof(Bool));
+ HWord* key2 = LibVEX_Alloc(2 * h->size * sizeof(HWord));
+ HWord* val2 = LibVEX_Alloc(2 * h->size * sizeof(HWord));
+ for (i = j = 0; i < h->size; i++) {
+ if (!h->inuse[i]) continue;
+ inuse2[j] = True;
+ key2[j] = h->key[i];
+ val2[j] = h->val[i];
+ j++;
+ }
+ h->used = j;
+ h->size *= 2;
+ h->inuse = inuse2;
+ h->key = key2;
+ h->val = val2;
+ }
+
+ /* Finally, add it. */
+ vassert(h->used < h->size);
+ h->inuse[h->used] = True;
+ h->key[h->used] = key;
+ h->val[h->used] = val;
+ h->used++;
+}
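+
+/* Usage sketch (illustrative); lookup is a linear scan, which is
+   adequate for the small maps used here:
+
+      HashHW* h = newHHW();
+      addToHHW( h, (HWord)key, (HWord)val );
+      HWord v;
+      if (lookupHHW( h, &v, (HWord)key ))
+         ...                  -- v now holds val
+*/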
+
+
+/*---------------------------------------------------------------*/
+/*--- Flattening out a BB into atomic SSA form ---*/
+/*---------------------------------------------------------------*/
+
+/* Non-critical helper, heuristic for reducing the number of tmp-tmp
+ copies made by flattening. If in doubt return False. */
+
+static Bool isFlat ( IRExpr* e )
+{
+ if (e->tag == Iex_Get)
+ return True;
+ if (e->tag == Iex_Binop)
+ return toBool( isIRAtom(e->Iex.Binop.arg1)
+ && isIRAtom(e->Iex.Binop.arg2) );
+ if (e->tag == Iex_Load)
+ return isIRAtom(e->Iex.Load.addr);
+ return False;
+}
+
+/* Flatten out 'ex' so it is atomic, returning a new expression with
+ the same value, after having appended extra IRTemp assignments to
+ the end of 'bb'. */
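+
+/* Example (illustrative): flattening Add32(GET:I32(8),0x1:I32)
+   appends
+      t1 = GET:I32(8)
+      t2 = Add32(t1,0x1:I32)
+   to 'bb' and returns RdTmp(t2). */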
+
+static IRExpr* flatten_Expr ( IRSB* bb, IRExpr* ex )
+{
+ Int i;
+ IRExpr** newargs;
+ IRType ty = typeOfIRExpr(bb->tyenv, ex);
+ IRTemp t1;
+
+ switch (ex->tag) {
+
+ case Iex_GetI:
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb, IRStmt_WrTmp(t1,
+ IRExpr_GetI(ex->Iex.GetI.descr,
+ flatten_Expr(bb, ex->Iex.GetI.ix),
+ ex->Iex.GetI.bias)));
+ return IRExpr_RdTmp(t1);
+
+ case Iex_Get:
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb,
+ IRStmt_WrTmp(t1, ex));
+ return IRExpr_RdTmp(t1);
+
+ case Iex_Qop:
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb, IRStmt_WrTmp(t1,
+ IRExpr_Qop(ex->Iex.Qop.op,
+ flatten_Expr(bb, ex->Iex.Qop.arg1),
+ flatten_Expr(bb, ex->Iex.Qop.arg2),
+ flatten_Expr(bb, ex->Iex.Qop.arg3),
+ flatten_Expr(bb, ex->Iex.Qop.arg4))));
+ return IRExpr_RdTmp(t1);
+
+ case Iex_Triop:
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb, IRStmt_WrTmp(t1,
+ IRExpr_Triop(ex->Iex.Triop.op,
+ flatten_Expr(bb, ex->Iex.Triop.arg1),
+ flatten_Expr(bb, ex->Iex.Triop.arg2),
+ flatten_Expr(bb, ex->Iex.Triop.arg3))));
+ return IRExpr_RdTmp(t1);
+
+ case Iex_Binop:
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb, IRStmt_WrTmp(t1,
+ IRExpr_Binop(ex->Iex.Binop.op,
+ flatten_Expr(bb, ex->Iex.Binop.arg1),
+ flatten_Expr(bb, ex->Iex.Binop.arg2))));
+ return IRExpr_RdTmp(t1);
+
+ case Iex_Unop:
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb, IRStmt_WrTmp(t1,
+ IRExpr_Unop(ex->Iex.Unop.op,
+ flatten_Expr(bb, ex->Iex.Unop.arg))));
+ return IRExpr_RdTmp(t1);
+
+ case Iex_Load:
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb, IRStmt_WrTmp(t1,
+ IRExpr_Load(ex->Iex.Load.end,
+ ex->Iex.Load.ty,
+ flatten_Expr(bb, ex->Iex.Load.addr))));
+ return IRExpr_RdTmp(t1);
+
+ case Iex_CCall:
+ newargs = shallowCopyIRExprVec(ex->Iex.CCall.args);
+ for (i = 0; newargs[i]; i++)
+ newargs[i] = flatten_Expr(bb, newargs[i]);
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb, IRStmt_WrTmp(t1,
+ IRExpr_CCall(ex->Iex.CCall.cee,
+ ex->Iex.CCall.retty,
+ newargs)));
+ return IRExpr_RdTmp(t1);
+
+ case Iex_Mux0X:
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb, IRStmt_WrTmp(t1,
+ IRExpr_Mux0X(flatten_Expr(bb, ex->Iex.Mux0X.cond),
+ flatten_Expr(bb, ex->Iex.Mux0X.expr0),
+ flatten_Expr(bb, ex->Iex.Mux0X.exprX))));
+ return IRExpr_RdTmp(t1);
+
+ case Iex_Const:
+ /* Lift F64i constants out onto temps so they can be CSEd
+ later. */
+ if (ex->Iex.Const.con->tag == Ico_F64i) {
+ t1 = newIRTemp(bb->tyenv, ty);
+ addStmtToIRSB(bb, IRStmt_WrTmp(t1,
+ IRExpr_Const(ex->Iex.Const.con)));
+ return IRExpr_RdTmp(t1);
+ } else {
+ /* Leave all other constants alone. */
+ return ex;
+ }
+
+ case Iex_RdTmp:
+ return ex;
+
+ default:
+ vex_printf("\n");
+ ppIRExpr(ex);
+ vex_printf("\n");
+ vpanic("flatten_Expr");
+ }
+}
+
+
+/* Append a completely flattened form of 'st' to the end of 'bb'. */
+
+static void flatten_Stmt ( IRSB* bb, IRStmt* st )
+{
+ Int i;
+ IRExpr *e1, *e2, *e3, *e4, *e5;
+ IRDirty *d, *d2;
+ IRCAS *cas, *cas2;
+ switch (st->tag) {
+ case Ist_Put:
+ if (isIRAtom(st->Ist.Put.data)) {
+ /* optimisation to reduce the amount of heap wasted
+ by the flattener */
+ addStmtToIRSB(bb, st);
+ } else {
+ /* general case, always correct */
+ e1 = flatten_Expr(bb, st->Ist.Put.data);
+ addStmtToIRSB(bb, IRStmt_Put(st->Ist.Put.offset, e1));
+ }
+ break;
+ case Ist_PutI:
+ e1 = flatten_Expr(bb, st->Ist.PutI.ix);
+ e2 = flatten_Expr(bb, st->Ist.PutI.data);
+ addStmtToIRSB(bb, IRStmt_PutI(st->Ist.PutI.descr,
+ e1,
+ st->Ist.PutI.bias,
+ e2));
+ break;
+ case Ist_WrTmp:
+ if (isFlat(st->Ist.WrTmp.data)) {
+ /* optimisation, to reduce the number of tmp-tmp
+ copies generated */
+ addStmtToIRSB(bb, st);
+ } else {
+ /* general case, always correct */
+ e1 = flatten_Expr(bb, st->Ist.WrTmp.data);
+ addStmtToIRSB(bb, IRStmt_WrTmp(st->Ist.WrTmp.tmp, e1));
+ }
+ break;
+ case Ist_Store:
+ e1 = flatten_Expr(bb, st->Ist.Store.addr);
+ e2 = flatten_Expr(bb, st->Ist.Store.data);
+ addStmtToIRSB(bb, IRStmt_Store(st->Ist.Store.end, e1,e2));
+ break;
+ case Ist_CAS:
+ cas = st->Ist.CAS.details;
+ e1 = flatten_Expr(bb, cas->addr);
+ e2 = cas->expdHi ? flatten_Expr(bb, cas->expdHi) : NULL;
+ e3 = flatten_Expr(bb, cas->expdLo);
+ e4 = cas->dataHi ? flatten_Expr(bb, cas->dataHi) : NULL;
+ e5 = flatten_Expr(bb, cas->dataLo);
+ cas2 = mkIRCAS( cas->oldHi, cas->oldLo, cas->end,
+ e1, e2, e3, e4, e5 );
+ addStmtToIRSB(bb, IRStmt_CAS(cas2));
+ break;
+ case Ist_LLSC:
+ e1 = flatten_Expr(bb, st->Ist.LLSC.addr);
+ e2 = st->Ist.LLSC.storedata
+ ? flatten_Expr(bb, st->Ist.LLSC.storedata)
+ : NULL;
+ addStmtToIRSB(bb, IRStmt_LLSC(st->Ist.LLSC.end,
+ st->Ist.LLSC.result, e1, e2));
+ break;
+ case Ist_Dirty:
+ d = st->Ist.Dirty.details;
+ d2 = emptyIRDirty();
+ *d2 = *d;
+ d2->args = shallowCopyIRExprVec(d2->args);
+ if (d2->mFx != Ifx_None) {
+ d2->mAddr = flatten_Expr(bb, d2->mAddr);
+ } else {
+ vassert(d2->mAddr == NULL);
+ }
+ d2->guard = flatten_Expr(bb, d2->guard);
+ for (i = 0; d2->args[i]; i++)
+ d2->args[i] = flatten_Expr(bb, d2->args[i]);
+ addStmtToIRSB(bb, IRStmt_Dirty(d2));
+ break;
+ case Ist_NoOp:
+ case Ist_MBE:
+ case Ist_IMark:
+ addStmtToIRSB(bb, st);
+ break;
+ case Ist_AbiHint:
+ e1 = flatten_Expr(bb, st->Ist.AbiHint.base);
+ e2 = flatten_Expr(bb, st->Ist.AbiHint.nia);
+ addStmtToIRSB(bb, IRStmt_AbiHint(e1, st->Ist.AbiHint.len, e2));
+ break;
+ case Ist_Exit:
+ e1 = flatten_Expr(bb, st->Ist.Exit.guard);
+ addStmtToIRSB(bb, IRStmt_Exit(e1, st->Ist.Exit.jk,
+ st->Ist.Exit.dst));
+ break;
+ default:
+ vex_printf("\n");
+ ppIRStmt(st);
+ vex_printf("\n");
+ vpanic("flatten_Stmt");
+ }
+}
+
+
+static IRSB* flatten_BB ( IRSB* in )
+{
+ Int i;
+ IRSB* out;
+ out = emptyIRSB();
+ out->tyenv = deepCopyIRTypeEnv( in->tyenv );
+ for (i = 0; i < in->stmts_used; i++)
+ if (in->stmts[i])
+ flatten_Stmt( out, in->stmts[i] );
+ out->next = flatten_Expr( out, in->next );
+ out->jumpkind = in->jumpkind;
+ return out;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- In-place removal of redundant GETs ---*/
+/*---------------------------------------------------------------*/
+
+/* Scan forwards, building up an environment binding (min offset, max
+ offset) pairs to values, which will either be temps or constants.
+
+ On seeing 't = Get(minoff,maxoff)', look up (minoff,maxoff) in the
+ env and if it matches, replace the Get with the stored value. If
+ there is no match, add a (minoff,maxoff) :-> t binding.
+
+ On seeing 'Put (minoff,maxoff) = t or c', first remove from the env
+ any binding which fully or partially overlaps with (minoff,maxoff).
+ Then add a new (minoff,maxoff) :-> t or c binding. */
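+
+/* Example (illustrative):
+      t1 = GET:I32(0)    -- adds binding (0,3) :-> t1
+      t2 = GET:I32(0)    -- rewritten to t2 = t1
+      PUT(0) = t3        -- invalidates (0,3), then binds (0,3) :-> t3
+      t4 = GET:I32(0)    -- rewritten to t4 = t3
+*/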
+
+/* Extract the min/max offsets from a guest state array descriptor. */
+
+inline
+static void getArrayBounds ( IRRegArray* descr,
+ UInt* minoff, UInt* maxoff )
+{
+ *minoff = descr->base;
+ *maxoff = *minoff + descr->nElems*sizeofIRType(descr->elemTy) - 1;
+ vassert((*minoff & ~0xFFFF) == 0);
+ vassert((*maxoff & ~0xFFFF) == 0);
+ vassert(*minoff <= *maxoff);
+}
+
+/* Create keys, of the form ((minoffset << 16) | maxoffset). */
+
+static UInt mk_key_GetPut ( Int offset, IRType ty )
+{
+ /* offset should fit in 16 bits. */
+ UInt minoff = offset;
+ UInt maxoff = minoff + sizeofIRType(ty) - 1;
+ vassert((minoff & ~0xFFFF) == 0);
+ vassert((maxoff & ~0xFFFF) == 0);
+ return (minoff << 16) | maxoff;
+}
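+
+/* Example (illustrative): an I32 Get/Put at offset 16 covers bytes
+   16 .. 19, so mk_key_GetPut(16, Ity_I32) == (16 << 16) | 19
+   == 0x100013. */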
+
+static UInt mk_key_GetIPutI ( IRRegArray* descr )
+{
+ UInt minoff, maxoff;
+ getArrayBounds( descr, &minoff, &maxoff );
+ vassert((minoff & ~0xFFFF) == 0);
+ vassert((maxoff & ~0xFFFF) == 0);
+ return (minoff << 16) | maxoff;
+}
+
+/* Supposing h has keys of the form generated by mk_key_GetPut and
+ mk_key_GetIPutI, invalidate any key which overlaps (k_lo
+ .. k_hi).
+*/
+static void invalidateOverlaps ( HashHW* h, UInt k_lo, UInt k_hi )
+{
+ Int j;
+ UInt e_lo, e_hi;
+ vassert(k_lo <= k_hi);
+ /* invalidate any env entries which in any way overlap (k_lo
+ .. k_hi) */
+ /* vex_printf("invalidate %d .. %d\n", k_lo, k_hi ); */
+
+ for (j = 0; j < h->used; j++) {
+ if (!h->inuse[j])
+ continue;
+ e_lo = (((UInt)h->key[j]) >> 16) & 0xFFFF;
+ e_hi = ((UInt)h->key[j]) & 0xFFFF;
+ vassert(e_lo <= e_hi);
+ if (e_hi < k_lo || k_hi < e_lo)
+ continue; /* no overlap possible */
+ else
+ /* overlap; invalidate */
+ h->inuse[j] = False;
+ }
+}
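+
+/* Example (illustrative): with entries for (0,3) and (8,11) in h,
+   invalidateOverlaps(h, 2, 9) invalidates both, since each range
+   overlaps (2 .. 9). */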
+
+
+static void redundant_get_removal_BB ( IRSB* bb )
+{
+ HashHW* env = newHHW();
+ UInt key = 0; /* keep gcc -O happy */
+ Int i, j;
+ HWord val;
+
+ for (i = 0; i < bb->stmts_used; i++) {
+ IRStmt* st = bb->stmts[i];
+
+ if (st->tag == Ist_NoOp)
+ continue;
+
+ /* Deal with Gets */
+ if (st->tag == Ist_WrTmp
+ && st->Ist.WrTmp.data->tag == Iex_Get) {
+ /* st is 't = Get(...)'. Look up in the environment and see
+ if the Get can be replaced. */
+ IRExpr* get = st->Ist.WrTmp.data;
+ key = (HWord)mk_key_GetPut( get->Iex.Get.offset,
+ get->Iex.Get.ty );
+ if (lookupHHW(env, &val, (HWord)key)) {
+ /* found it */
+ /* Note, we could do better here. If the types are
+ different we don't do the substitution, since doing so
+ could lead to invalidly-typed IR. An improvement would
+ be to stick in a reinterpret-style cast, although that
+ would make maintaining flatness more difficult. */
+ IRExpr* valE = (IRExpr*)val;
+ Bool typesOK = toBool( typeOfIRExpr(bb->tyenv,valE)
+ == st->Ist.WrTmp.data->Iex.Get.ty );
+ if (typesOK && DEBUG_IROPT) {
+ vex_printf("rGET: "); ppIRExpr(get);
+ vex_printf(" -> "); ppIRExpr(valE);
+ vex_printf("\n");
+ }
+ if (typesOK)
+ bb->stmts[i] = IRStmt_WrTmp(st->Ist.WrTmp.tmp, valE);
+ } else {
+ /* Not found, but at least we know that t and the Get(...)
+ are now associated. So add a binding to reflect that
+ fact. */
+ addToHHW( env, (HWord)key,
+ (HWord)(void*)(IRExpr_RdTmp(st->Ist.WrTmp.tmp)) );
+ }
+ }
+
+ /* Deal with Puts: invalidate any env entries overlapped by this
+ Put */
+ if (st->tag == Ist_Put || st->tag == Ist_PutI) {
+ UInt k_lo, k_hi;
+ if (st->tag == Ist_Put) {
+ key = mk_key_GetPut( st->Ist.Put.offset,
+ typeOfIRExpr(bb->tyenv,st->Ist.Put.data) );
+ } else {
+ vassert(st->tag == Ist_PutI);
+ key = mk_key_GetIPutI( st->Ist.PutI.descr );
+ }
+
+ k_lo = (key >> 16) & 0xFFFF;
+ k_hi = key & 0xFFFF;
+ invalidateOverlaps(env, k_lo, k_hi);
+ }
+ else
+ if (st->tag == Ist_Dirty) {
+ /* Deal with dirty helpers which write or modify guest state.
+ Invalidate the entire env. We could do a lot better
+ here. */
+ IRDirty* d = st->Ist.Dirty.details;
+ Bool writes = False;
+ for (j = 0; j < d->nFxState; j++) {
+ if (d->fxState[j].fx == Ifx_Modify
+ || d->fxState[j].fx == Ifx_Write)
+ writes = True;
+ }
+ if (writes) {
+ /* dump the entire env (not clever, but correct ...) */
+ for (j = 0; j < env->used; j++)
+ env->inuse[j] = False;
+ if (0) vex_printf("rGET: trash env due to dirty helper\n");
+ }
+ }
+
+ /* add this one to the env, if appropriate */
+ if (st->tag == Ist_Put) {
+ vassert(isIRAtom(st->Ist.Put.data));
+ addToHHW( env, (HWord)key, (HWord)(st->Ist.Put.data));
+ }
+
+ } /* for (i = 0; i < bb->stmts_used; i++) */
+
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- In-place removal of redundant PUTs ---*/
+/*---------------------------------------------------------------*/
+
+/* Find any Get uses in st and invalidate any partially or fully
+ overlapping ranges listed in env. Due to the flattening phase, the
+ only stmt kind we expect to find a Get on is IRStmt_WrTmp. */
+
+static void handle_gets_Stmt (
+ HashHW* env,
+ IRStmt* st,
+ Bool (*preciseMemExnsFn)(Int,Int)
+ )
+{
+ Int j;
+ UInt key = 0; /* keep gcc -O happy */
+ Bool isGet;
+ Bool memRW = False;
+ IRExpr* e;
+
+ switch (st->tag) {
+
+ /* This is the only interesting case. Deal with Gets in the RHS
+ expression. */
+ case Ist_WrTmp:
+ e = st->Ist.WrTmp.data;
+ switch (e->tag) {
+ case Iex_Get:
+ isGet = True;
+ key = mk_key_GetPut ( e->Iex.Get.offset, e->Iex.Get.ty );
+ break;
+ case Iex_GetI:
+ isGet = True;
+ key = mk_key_GetIPutI ( e->Iex.GetI.descr );
+ break;
+ case Iex_Load:
+ isGet = False;
+ memRW = True;
+ break;
+ default:
+ isGet = False;
+ }
+ if (isGet) {
+ UInt k_lo, k_hi;
+ k_lo = (key >> 16) & 0xFFFF;
+ k_hi = key & 0xFFFF;
+ invalidateOverlaps(env, k_lo, k_hi);
+ }
+ break;
+
+ /* Be very conservative for dirty helper calls; dump the entire
+ environment. The helper might read guest state, in which
+ case it needs to be flushed first. Also, the helper might
+ access guest memory, in which case all parts of the guest
+ state requiring precise exceptions need to be flushed. The
+ crude solution is just to flush everything; we could easily
+ enough do a lot better if needed. */
+ /* Probably also overly-conservative, but also dump everything
+ if we hit a memory bus event (fence, lock, unlock). Ditto
+ AbiHints, CASs, LLs and SCs. */
+ case Ist_AbiHint:
+ vassert(isIRAtom(st->Ist.AbiHint.base));
+ vassert(isIRAtom(st->Ist.AbiHint.nia));
+ /* fall through */
+ case Ist_MBE:
+ case Ist_Dirty:
+ case Ist_CAS:
+ case Ist_LLSC:
+ for (j = 0; j < env->used; j++)
+ env->inuse[j] = False;
+ break;
+
+ /* all other cases are boring. */
+ case Ist_Store:
+ vassert(isIRAtom(st->Ist.Store.addr));
+ vassert(isIRAtom(st->Ist.Store.data));
+ memRW = True;
+ break;
+
+ case Ist_Exit:
+ vassert(isIRAtom(st->Ist.Exit.guard));
+ break;
+
+ case Ist_PutI:
+ vassert(isIRAtom(st->Ist.PutI.ix));
+ vassert(isIRAtom(st->Ist.PutI.data));
+ break;
+
+ case Ist_NoOp:
+ case Ist_IMark:
+ break;
+
+ default:
+ vex_printf("\n");
+ ppIRStmt(st);
+ vex_printf("\n");
+ vpanic("handle_gets_Stmt");
+ }
+
+ if (memRW) {
+ /* This statement accesses memory. So we need to dump all parts
+ of the environment corresponding to guest state that may not
+ be reordered with respect to memory references. That means
+ at least the stack pointer. */
+ for (j = 0; j < env->used; j++) {
+ if (!env->inuse[j])
+ continue;
+ if (vex_control.iropt_precise_memory_exns) {
+ /* Precise exceptions required. Flush all guest state. */
+ env->inuse[j] = False;
+ } else {
+ /* Just flush the minimal amount required, as computed by
+ preciseMemExnsFn. */
+ HWord k_lo = (env->key[j] >> 16) & 0xFFFF;
+ HWord k_hi = env->key[j] & 0xFFFF;
+ if (preciseMemExnsFn( k_lo, k_hi ))
+ env->inuse[j] = False;
+ }
+ }
+ } /* if (memRW) */
+
+}
+
+
+/* Scan backwards, building up a set of (min offset, max
+ offset) pairs, indicating those parts of the guest state
+ for which the next event is a write.
+
+ On seeing a conditional exit, empty the set.
+
+ On seeing 'Put (minoff,maxoff) = t or c', if (minoff,maxoff) is
+ completely within the set, remove the Put. Otherwise, add
+ (minoff,maxoff) to the set.
+
+ On seeing 'Get (minoff,maxoff)', remove any part of the set
+ overlapping (minoff,maxoff). The same has to happen for any events
+ which implicitly read parts of the guest state: dirty helper calls
+ and loads/stores.
+*/
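+
+/* Example (illustrative). Given, in program order,
+      PUT(0) = t3
+      PUT(0) = t1
+   the backwards scan first meets 'PUT(0) = t1' and adds (0,3) to the
+   set; it then meets 'PUT(0) = t3', finds (0,3) present, and NoOps
+   it, since that write is overwritten before any possible read. */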
+
+static void redundant_put_removal_BB (
+ IRSB* bb,
+ Bool (*preciseMemExnsFn)(Int,Int)
+ )
+{
+ Int i, j;
+ Bool isPut;
+ IRStmt* st;
+ UInt key = 0; /* keep gcc -O happy */
+
+ HashHW* env = newHHW();
+ for (i = bb->stmts_used-1; i >= 0; i--) {
+ st = bb->stmts[i];
+
+ if (st->tag == Ist_NoOp)
+ continue;
+
+ /* Deal with conditional exits. */
+ if (st->tag == Ist_Exit) {
+ /* Since control may not get beyond this point, we must empty
+ out the set, as we can no longer claim that the next
+ event for any part of the guest state is definitely a
+ write. */
+ vassert(isIRAtom(st->Ist.Exit.guard));
+ for (j = 0; j < env->used; j++)
+ env->inuse[j] = False;
+ continue;
+ }
+
+ /* Deal with Puts */
+ switch (st->tag) {
+ case Ist_Put:
+ isPut = True;
+ key = mk_key_GetPut( st->Ist.Put.offset,
+ typeOfIRExpr(bb->tyenv,st->Ist.Put.data) );
+ vassert(isIRAtom(st->Ist.Put.data));
+ break;
+ case Ist_PutI:
+ isPut = True;
+ key = mk_key_GetIPutI( st->Ist.PutI.descr );
+ vassert(isIRAtom(st->Ist.PutI.ix));
+ vassert(isIRAtom(st->Ist.PutI.data));
+ break;
+ default:
+ isPut = False;
+ }
+ if (isPut && st->tag != Ist_PutI) {
+ /* See if any single entry in env overlaps this Put. This is
+ simplistic, in that the transformation would also be valid
+ if, say, two or more entries in the env jointly covered this
+ Put, but lookupHHW will only find a single entry which
+ exactly matches it. This is suboptimal but safe. */
+ if (lookupHHW(env, NULL, (HWord)key)) {
+ /* This Put is redundant because a later one will overwrite
+ it. So NULL (nop) it out. */
+ if (DEBUG_IROPT) {
+ vex_printf("rPUT: "); ppIRStmt(st);
+ vex_printf("\n");
+ }
+ bb->stmts[i] = IRStmt_NoOp();
+ } else {
+ /* We can't demonstrate that this Put is redundant, so add it
+ to the running collection. */
+ addToHHW(env, (HWord)key, 0);
+ }
+ continue;
+ }
+
+ /* Deal with Gets. These remove bits of the environment since
+ appearance of a Get means that the next event for that slice
+ of the guest state is no longer a write, but a read. Also
+ deals with implicit reads of guest state needed to maintain
+ precise exceptions. */
+ handle_gets_Stmt( env, st, preciseMemExnsFn );
+ }
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Constant propagation and folding ---*/
+/*---------------------------------------------------------------*/
+
+/* The env in this section is a map from IRTemp to IRExpr*,
+ that is, an array indexed by IRTemp. */
+
+/* Are both expressions simply the same IRTemp ? */
+static Bool sameIRTemps ( IRExpr* e1, IRExpr* e2 )
+{
+ return toBool( e1->tag == Iex_RdTmp
+ && e2->tag == Iex_RdTmp
+ && e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp );
+}
+
+static Bool sameIcoU32s ( IRExpr* e1, IRExpr* e2 )
+{
+ return toBool( e1->tag == Iex_Const
+ && e2->tag == Iex_Const
+ && e1->Iex.Const.con->tag == Ico_U32
+ && e2->Iex.Const.con->tag == Ico_U32
+ && e1->Iex.Const.con->Ico.U32
+ == e2->Iex.Const.con->Ico.U32 );
+}
+
+/* Are both expressions either the same IRTemp or IRConst-U32s? If
+ in doubt, say No. */
+static Bool sameIRTempsOrIcoU32s ( IRExpr* e1, IRExpr* e2 )
+{
+ switch (e1->tag) {
+ case Iex_RdTmp:
+ return sameIRTemps(e1, e2);
+ case Iex_Const:
+ return sameIcoU32s(e1, e2);
+ default:
+ return False;
+ }
+}
+
+static Bool notBool ( Bool b )
+{
+ if (b == True) return False;
+ if (b == False) return True;
+ vpanic("notBool");
+}
+
+/* Make a zero which has the same type as the result of the given
+ primop. */
+static IRExpr* mkZeroOfPrimopResultType ( IROp op )
+{
+ switch (op) {
+ case Iop_Xor8: return IRExpr_Const(IRConst_U8(0));
+ case Iop_Xor16: return IRExpr_Const(IRConst_U16(0));
+ case Iop_Sub32:
+ case Iop_Xor32: return IRExpr_Const(IRConst_U32(0));
+ case Iop_Sub64:
+ case Iop_Xor64: return IRExpr_Const(IRConst_U64(0));
+ case Iop_XorV128: return IRExpr_Const(IRConst_V128(0));
+ default: vpanic("mkZeroOfPrimopResultType: bad primop");
+ }
+}
+
+/* Make a value containing all 1-bits, which has the same type as the
+ result of the given primop. */
+static IRExpr* mkOnesOfPrimopResultType ( IROp op )
+{
+ switch (op) {
+ case Iop_CmpEQ64:
+ return IRExpr_Const(IRConst_U1(toBool(1)));
+ case Iop_CmpEQ8x8:
+ return IRExpr_Const(IRConst_U64(0xFFFFFFFFFFFFFFFFULL));
+ case Iop_CmpEQ8x16:
+ return IRExpr_Const(IRConst_V128(0xFFFF));
+ default:
+ vpanic("mkOnesOfPrimopResultType: bad primop");
+ }
+}
+
+
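+/* Fold 'e' into a simpler expression where possible; if no rule
+   applies, return it unchanged. Examples (illustrative):
+   Add32(0x2:I32,0x3:I32) folds to 0x5:I32, and And32(t5,0x0:I32)
+   folds to 0x0:I32. */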
+static IRExpr* fold_Expr ( IRExpr* e )
+{
+ Int shift;
+ IRExpr* e2 = e; /* e2 is the result of folding e, if possible */
+
+ /* UNARY ops */
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.arg->tag == Iex_Const) {
+ switch (e->Iex.Unop.op) {
+ case Iop_1Uto8:
+ e2 = IRExpr_Const(IRConst_U8(toUChar(
+ e->Iex.Unop.arg->Iex.Const.con->Ico.U1
+ ? 1 : 0)));
+ break;
+ case Iop_1Uto32:
+ e2 = IRExpr_Const(IRConst_U32(
+ e->Iex.Unop.arg->Iex.Const.con->Ico.U1
+ ? 1 : 0));
+ break;
+ case Iop_1Uto64:
+ e2 = IRExpr_Const(IRConst_U64(
+ e->Iex.Unop.arg->Iex.Const.con->Ico.U1
+ ? 1 : 0));
+ break;
+
+ case Iop_1Sto8:
+ e2 = IRExpr_Const(IRConst_U8(toUChar(
+ e->Iex.Unop.arg->Iex.Const.con->Ico.U1
+ ? 0xFF : 0)));
+ break;
+ case Iop_1Sto16:
+ e2 = IRExpr_Const(IRConst_U16(toUShort(
+ e->Iex.Unop.arg->Iex.Const.con->Ico.U1
+ ? 0xFFFF : 0)));
+ break;
+ case Iop_1Sto32:
+ e2 = IRExpr_Const(IRConst_U32(
+ e->Iex.Unop.arg->Iex.Const.con->Ico.U1
+ ? 0xFFFFFFFF : 0));
+ break;
+ case Iop_1Sto64:
+ e2 = IRExpr_Const(IRConst_U64(
+ e->Iex.Unop.arg->Iex.Const.con->Ico.U1
+ ? 0xFFFFFFFFFFFFFFFFULL : 0));
+ break;
+
+ case Iop_8Sto32: {
+ /* signed */ Int s32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U8;
+ s32 <<= 24;
+ s32 >>= 24;
+ e2 = IRExpr_Const(IRConst_U32((UInt)s32));
+ break;
+ }
+ case Iop_8Uto64:
+ e2 = IRExpr_Const(IRConst_U64(
+ 0xFFULL & e->Iex.Unop.arg->Iex.Const.con->Ico.U8));
+ break;
+ case Iop_16Uto64:
+ e2 = IRExpr_Const(IRConst_U64(
+ 0xFFFFULL & e->Iex.Unop.arg->Iex.Const.con->Ico.U16));
+ break;
+ case Iop_8Uto32:
+ e2 = IRExpr_Const(IRConst_U32(
+ 0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8));
+ break;
+ case Iop_16Uto32:
+ e2 = IRExpr_Const(IRConst_U32(
+ 0xFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U16));
+ break;
+ case Iop_32to16:
+ e2 = IRExpr_Const(IRConst_U16(toUShort(
+ 0xFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
+ break;
+ case Iop_32to8:
+ e2 = IRExpr_Const(IRConst_U8(toUChar(
+ 0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
+ break;
+ case Iop_32to1:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ 1 == (1 & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)
+ )));
+ break;
+ case Iop_64to1:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ 1 == (1 & e->Iex.Unop.arg->Iex.Const.con->Ico.U64)
+ )));
+ break;
+
+ case Iop_Not64:
+ e2 = IRExpr_Const(IRConst_U64(
+ ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U64)));
+ break;
+ case Iop_Not32:
+ e2 = IRExpr_Const(IRConst_U32(
+ ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U32)));
+ break;
+ case Iop_Not16:
+ e2 = IRExpr_Const(IRConst_U16(toUShort(
+ ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U16))));
+ break;
+ case Iop_Not8:
+ e2 = IRExpr_Const(IRConst_U8(toUChar(
+ ~ (e->Iex.Unop.arg->Iex.Const.con->Ico.U8))));
+ break;
+
+ case Iop_Not1:
+ e2 = IRExpr_Const(IRConst_U1(
+ notBool(e->Iex.Unop.arg->Iex.Const.con->Ico.U1)));
+ break;
+
+ case Iop_64to8: {
+ ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
+ w64 &= 0xFFULL;
+ e2 = IRExpr_Const(IRConst_U8( (UChar)w64 ));
+ break;
+ }
+ case Iop_64to16: {
+ ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
+ w64 &= 0xFFFFULL;
+ e2 = IRExpr_Const(IRConst_U16( (UShort)w64 ));
+ break;
+ }
+ case Iop_64to32: {
+ ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
+ w64 &= 0x00000000FFFFFFFFULL;
+ e2 = IRExpr_Const(IRConst_U32( (UInt)w64 ));
+ break;
+ }
+ case Iop_64HIto32: {
+ ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
+ w64 >>= 32;
+ e2 = IRExpr_Const(IRConst_U32( (UInt)w64 ));
+ break;
+ }
+ case Iop_32Uto64:
+ e2 = IRExpr_Const(IRConst_U64(
+ 0xFFFFFFFFULL
+ & e->Iex.Unop.arg->Iex.Const.con->Ico.U32));
+ break;
+ case Iop_32Sto64: {
+ /* signed */ Long s64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
+ s64 <<= 32;
+ s64 >>= 32;
+ e2 = IRExpr_Const(IRConst_U64((ULong)s64));
+ break;
+ }
+ case Iop_CmpNEZ8:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ 0 !=
+ (0xFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U8)
+ )));
+ break;
+ case Iop_CmpNEZ32:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ 0 !=
+ (0xFFFFFFFF & e->Iex.Unop.arg->Iex.Const.con->Ico.U32)
+ )));
+ break;
+ case Iop_CmpNEZ64:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ 0ULL != e->Iex.Unop.arg->Iex.Const.con->Ico.U64
+ )));
+ break;
+
+ case Iop_CmpwNEZ32: {
+ UInt w32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
+ if (w32 == 0)
+ e2 = IRExpr_Const(IRConst_U32( 0 ));
+ else
+ e2 = IRExpr_Const(IRConst_U32( 0xFFFFFFFF ));
+ break;
+ }
+ case Iop_CmpwNEZ64: {
+ ULong w64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
+ if (w64 == 0)
+ e2 = IRExpr_Const(IRConst_U64( 0 ));
+ else
+ e2 = IRExpr_Const(IRConst_U64( 0xFFFFFFFFFFFFFFFFULL ));
+ break;
+ }
+
+ case Iop_Left32: {
+ UInt u32 = e->Iex.Unop.arg->Iex.Const.con->Ico.U32;
+ Int s32 = (Int)(u32 & 0xFFFFFFFF);
+ s32 = (s32 | (-s32));
+ e2 = IRExpr_Const( IRConst_U32( (UInt)s32 ));
+ break;
+ }
+
+ case Iop_Left64: {
+ ULong u64 = e->Iex.Unop.arg->Iex.Const.con->Ico.U64;
+ Long s64 = (Long)u64;
+ s64 = (s64 | (-s64));
+ e2 = IRExpr_Const( IRConst_U64( (ULong)s64 ));
+ break;
+ }
+
+ default:
+ goto unhandled;
+ }
+ }
+
+ /* BINARY ops */
+ if (e->tag == Iex_Binop) {
+ if (e->Iex.Binop.arg1->tag == Iex_Const
+ && e->Iex.Binop.arg2->tag == Iex_Const) {
+ /* cases where both args are consts */
+ switch (e->Iex.Binop.op) {
+
+ /* -- Or -- */
+ case Iop_Or8:
+ e2 = IRExpr_Const(IRConst_U8(toUChar(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
+ | e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
+ break;
+ case Iop_Or16:
+ e2 = IRExpr_Const(IRConst_U16(toUShort(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16
+ | e->Iex.Binop.arg2->Iex.Const.con->Ico.U16))));
+ break;
+ case Iop_Or32:
+ e2 = IRExpr_Const(IRConst_U32(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
+ | e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
+ break;
+ case Iop_Or64:
+ e2 = IRExpr_Const(IRConst_U64(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
+ | e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
+ break;
+
+ /* -- Xor -- */
+ case Iop_Xor8:
+ e2 = IRExpr_Const(IRConst_U8(toUChar(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
+ ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
+ break;
+ case Iop_Xor16:
+ e2 = IRExpr_Const(IRConst_U16(toUShort(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U16
+ ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U16))));
+ break;
+ case Iop_Xor32:
+ e2 = IRExpr_Const(IRConst_U32(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
+ ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
+ break;
+ case Iop_Xor64:
+ e2 = IRExpr_Const(IRConst_U64(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
+ ^ e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
+ break;
+
+ /* -- And -- */
+ case Iop_And8:
+ e2 = IRExpr_Const(IRConst_U8(toUChar(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
+ & e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
+ break;
+ case Iop_And32:
+ e2 = IRExpr_Const(IRConst_U32(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
+ & e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
+ break;
+ case Iop_And64:
+ e2 = IRExpr_Const(IRConst_U64(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
+ & e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
+ break;
+
+ /* -- Add -- */
+ case Iop_Add8:
+ e2 = IRExpr_Const(IRConst_U8(toUChar(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
+ + e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
+ break;
+ case Iop_Add32:
+ e2 = IRExpr_Const(IRConst_U32(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
+ + e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
+ break;
+ case Iop_Add64:
+ e2 = IRExpr_Const(IRConst_U64(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
+ + e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
+ break;
+
+ /* -- Sub -- */
+ case Iop_Sub8:
+ e2 = IRExpr_Const(IRConst_U8(toUChar(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U8
+ - e->Iex.Binop.arg2->Iex.Const.con->Ico.U8))));
+ break;
+ case Iop_Sub32:
+ e2 = IRExpr_Const(IRConst_U32(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
+ - e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
+ break;
+ case Iop_Sub64:
+ e2 = IRExpr_Const(IRConst_U64(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
+ - e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
+ break;
+
+ /* -- Max32U -- */
+ case Iop_Max32U: {
+ UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
+ UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
+ UInt res = u32a > u32b ? u32a : u32b;
+ e2 = IRExpr_Const(IRConst_U32(res));
+ break;
+ }
+
+ /* -- Mul -- */
+ case Iop_Mul32:
+ e2 = IRExpr_Const(IRConst_U32(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
+ * e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)));
+ break;
+ case Iop_Mul64:
+ e2 = IRExpr_Const(IRConst_U64(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
+ * e->Iex.Binop.arg2->Iex.Const.con->Ico.U64)));
+ break;
+
+ case Iop_MullS32: {
+ /* very paranoid */
+ UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
+ UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
+ Int s32a = (Int)u32a;
+ Int s32b = (Int)u32b;
+ Long s64a = (Long)s32a;
+ Long s64b = (Long)s32b;
+ Long sres = s64a * s64b;
+ ULong ures = (ULong)sres;
+ e2 = IRExpr_Const(IRConst_U64(ures));
+ break;
+ }
+
+ /* -- Shl -- */
+ case Iop_Shl32:
+ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
+ shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
+ if (shift >= 0 && shift <= 31)
+ e2 = IRExpr_Const(IRConst_U32(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
+ << shift)));
+ break;
+ case Iop_Shl64:
+ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
+ shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
+ if (shift >= 0 && shift <= 63)
+ e2 = IRExpr_Const(IRConst_U64(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
+ << shift)));
+ break;
+
+ /* -- Sar -- */
+ case Iop_Sar32: {
+ /* paranoid ... */
+ /*signed*/ Int s32;
+ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
+ s32 = (Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32);
+ shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
+ if (shift >= 0 && shift <= 31) {
+ s32 >>=/*signed*/ shift;
+ e2 = IRExpr_Const(IRConst_U32((UInt)s32));
+ }
+ break;
+ }
+ case Iop_Sar64: {
+ /* paranoid ... */
+ /*signed*/ Long s64;
+ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
+ s64 = (Long)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64);
+ shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
+ if (shift >= 0 && shift <= 63) {
+ s64 >>=/*signed*/ shift;
+ e2 = IRExpr_Const(IRConst_U64((ULong)s64));
+ }
+ break;
+ }
+
+ /* -- Shr -- */
+ case Iop_Shr32: {
+ /* paranoid ... */
+ /*unsigned*/ UInt u32;
+ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
+ u32 = (UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32);
+ shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
+ if (shift >= 0 && shift <= 31) {
+ u32 >>=/*unsigned*/ shift;
+ e2 = IRExpr_Const(IRConst_U32(u32));
+ }
+ break;
+ }
+ case Iop_Shr64: {
+ /* paranoid ... */
+ /*unsigned*/ ULong u64;
+ vassert(e->Iex.Binop.arg2->Iex.Const.con->tag == Ico_U8);
+ u64 = (ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U64);
+ shift = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U8);
+ if (shift >= 0 && shift <= 63) {
+ u64 >>=/*unsigned*/ shift;
+ e2 = IRExpr_Const(IRConst_U64(u64));
+ }
+ break;
+ }
+
+ /* -- CmpEQ -- */
+ case Iop_CmpEQ32:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
+ == e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))));
+ break;
+ case Iop_CmpEQ64:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
+ == e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))));
+ break;
+
+ /* -- CmpNE -- */
+ case Iop_CmpNE8:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ ((0xFF & e->Iex.Binop.arg1->Iex.Const.con->Ico.U8)
+ != (0xFF & e->Iex.Binop.arg2->Iex.Const.con->Ico.U8)))));
+ break;
+ case Iop_CmpNE32:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U32
+ != e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))));
+ break;
+ case Iop_CmpNE64:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ (e->Iex.Binop.arg1->Iex.Const.con->Ico.U64
+ != e->Iex.Binop.arg2->Iex.Const.con->Ico.U64))));
+ break;
+
+ /* -- CmpLEU -- */
+ case Iop_CmpLE32U:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ ((UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
+ <= (UInt)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
+ break;
+
+ /* -- CmpLES -- */
+ case Iop_CmpLE32S:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ ((Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
+ <= (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
+ break;
+
+ /* -- CmpLTS -- */
+ case Iop_CmpLT32S:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ ((Int)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
+ < (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
+ break;
+
+ /* -- CmpLTU -- */
+ case Iop_CmpLT32U:
+ e2 = IRExpr_Const(IRConst_U1(toBool(
+ ((UInt)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)
+ < (UInt)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32)))));
+ break;
+
+ /* -- CmpORD -- */
+ case Iop_CmpORD32S: {
+ /* very paranoid */
+ UInt u32a = e->Iex.Binop.arg1->Iex.Const.con->Ico.U32;
+ UInt u32b = e->Iex.Binop.arg2->Iex.Const.con->Ico.U32;
+ Int s32a = (Int)u32a;
+ Int s32b = (Int)u32b;
+ Int r = 0x2; /* EQ */
+ if (s32a < s32b) {
+ r = 0x8; /* LT */
+ }
+ else if (s32a > s32b) {
+ r = 0x4; /* GT */
+ }
+ e2 = IRExpr_Const(IRConst_U32(r));
+ break;
+ }
+
+ /* -- nHLto2n -- */
+ case Iop_32HLto64:
+ e2 = IRExpr_Const(IRConst_U64(
+ (((ULong)(e->Iex.Binop.arg1->Iex.Const.con->Ico.U32)) << 32)
+ | ((ULong)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32))
+ ));
+ break;
+ case Iop_64HLto128:
+ /* We can't fold this, because there is no way to
+ express the result in IR, but at least pretend to
+ handle it, so as to stop getting blasted with
+ no-rule-for-this-primop messages. */
+ break;
+
+ default:
+ goto unhandled;
+ }
+
+ } else {
+
+ /* other cases (identities, etc) */
+
+ /* Shl64/Shr64(x,0) ==> x */
+ if ((e->Iex.Binop.op == Iop_Shl64 || e->Iex.Binop.op == Iop_Shr64)
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->Ico.U8 == 0) {
+ e2 = e->Iex.Binop.arg1;
+ } else
+
+ /* Shl32/Shr32(x,0) ==> x */
+ if ((e->Iex.Binop.op == Iop_Shl32 || e->Iex.Binop.op == Iop_Shr32)
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->Ico.U8 == 0) {
+ e2 = e->Iex.Binop.arg1;
+ } else
+
+ /* Or8(x,0) ==> x */
+ if ((e->Iex.Binop.op == Iop_Or8)
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->Ico.U8 == 0) {
+ e2 = e->Iex.Binop.arg1;
+ } else
+
+ /* Or16(x,0) ==> x */
+ if ((e->Iex.Binop.op == Iop_Or16)
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->Ico.U16 == 0) {
+ e2 = e->Iex.Binop.arg1;
+ } else
+
+ /* Or32/Add32/Max32U(x,0) ==> x */
+ if ((e->Iex.Binop.op == Iop_Add32
+ || e->Iex.Binop.op == Iop_Or32 || e->Iex.Binop.op == Iop_Max32U)
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->Ico.U32 == 0) {
+ e2 = e->Iex.Binop.arg1;
+ } else
+
+ /* Add32(t,t) ==> t << 1. Memcheck doesn't understand that
+ x+x produces a defined least significant bit, and it seems
+ simplest just to get rid of the problem by rewriting it
+ out, since the opportunity to do so exists. */
+ if (e->Iex.Binop.op == Iop_Add32
+ && e->Iex.Binop.arg1->tag == Iex_RdTmp
+ && e->Iex.Binop.arg2->tag == Iex_RdTmp
+ && e->Iex.Binop.arg1->Iex.RdTmp.tmp
+ == e->Iex.Binop.arg2->Iex.RdTmp.tmp) {
+ e2 = IRExpr_Binop(Iop_Shl32,
+ e->Iex.Binop.arg1,
+ IRExpr_Const(IRConst_U8(1)));
+ } else
+
+ /* Add64(t,t) ==> t << 1; rationale as for Add32(t,t) above. */
+ if (e->Iex.Binop.op == Iop_Add64
+ && e->Iex.Binop.arg1->tag == Iex_RdTmp
+ && e->Iex.Binop.arg2->tag == Iex_RdTmp
+ && e->Iex.Binop.arg1->Iex.RdTmp.tmp
+ == e->Iex.Binop.arg2->Iex.RdTmp.tmp) {
+ e2 = IRExpr_Binop(Iop_Shl64,
+ e->Iex.Binop.arg1,
+ IRExpr_Const(IRConst_U8(1)));
+ } else
+
+ /* Add8(t,t) ==> t << 1; rationale as for Add32(t,t) above. */
+ if (e->Iex.Binop.op == Iop_Add8
+ && e->Iex.Binop.arg1->tag == Iex_RdTmp
+ && e->Iex.Binop.arg2->tag == Iex_RdTmp
+ && e->Iex.Binop.arg1->Iex.RdTmp.tmp
+ == e->Iex.Binop.arg2->Iex.RdTmp.tmp) {
+ e2 = IRExpr_Binop(Iop_Shl8,
+ e->Iex.Binop.arg1,
+ IRExpr_Const(IRConst_U8(1)));
+ } else
+ /* NB no Add16(t,t) case yet as no known test case exists */
+
+ /* Or64/Add64(x,0) ==> x */
+ if ((e->Iex.Binop.op == Iop_Add64 || e->Iex.Binop.op == Iop_Or64)
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->Ico.U64 == 0) {
+ e2 = e->Iex.Binop.arg1;
+ } else
+
+ /* And32(x,0xFFFFFFFF) ==> x */
+ if (e->Iex.Binop.op == Iop_And32
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->Ico.U32 == 0xFFFFFFFF) {
+ e2 = e->Iex.Binop.arg1;
+ } else
+
+ /* And32(x,0) ==> 0 */
+ if (e->Iex.Binop.op == Iop_And32
+ && e->Iex.Binop.arg2->tag == Iex_Const
+ && e->Iex.Binop.arg2->Iex.Const.con->Ico.U32 == 0) {
+ e2 = IRExpr_Const(IRConst_U32(0));
+ } else
+
+ /* And32/Shl32(0,x) ==> 0 */
+ if ((e->Iex.Binop.op == Iop_And32 || e->Iex.Binop.op == Iop_Shl32)
+ && e->Iex.Binop.arg1->tag == Iex_Const
+ && e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 == 0) {
+ e2 = IRExpr_Const(IRConst_U32(0));
+ } else
+
+ /* Or8(0,x) ==> x */
+ if (e->Iex.Binop.op == Iop_Or8
+ && e->Iex.Binop.arg1->tag == Iex_Const
+ && e->Iex.Binop.arg1->Iex.Const.con->Ico.U8 == 0) {
+ e2 = e->Iex.Binop.arg2;
+ } else
+
+ /* Or32/Max32U(0,x) ==> x */
+ if ((e->Iex.Binop.op == Iop_Or32 || e->Iex.Binop.op == Iop_Max32U)
+ && e->Iex.Binop.arg1->tag == Iex_Const
+ && e->Iex.Binop.arg1->Iex.Const.con->Ico.U32 == 0) {
+ e2 = e->Iex.Binop.arg2;
+ } else
+
+ /* Or64(0,x) ==> x */
+ if (e->Iex.Binop.op == Iop_Or64
+ && e->Iex.Binop.arg1->tag == Iex_Const
+ && e->Iex.Binop.arg1->Iex.Const.con->Ico.U64 == 0) {
+ e2 = e->Iex.Binop.arg2;
+ } else
+
+ /* Or8/16/32/64/V128(t,t) ==> t, for some IRTemp t */
+ /* And8/16/32/64(t,t) ==> t, for some IRTemp t */
+ /* Max32U(t,t) ==> t, for some IRTemp t */
+ switch (e->Iex.Binop.op) {
+ case Iop_And64: case Iop_And32:
+ case Iop_And16: case Iop_And8:
+ case Iop_Or64: case Iop_Or32:
+ case Iop_Or16: case Iop_Or8: case Iop_OrV128:
+ case Iop_Max32U:
+ if (sameIRTemps(e->Iex.Binop.arg1, e->Iex.Binop.arg2))
+ e2 = e->Iex.Binop.arg1;
+ break;
+ default:
+ break;
+ }
+
+ /* Xor8/16/32/64/V128(t,t) ==> 0, for some IRTemp t */
+ /* Sub32/64(t,t) ==> 0, for some IRTemp t */
+ switch (e->Iex.Binop.op) {
+ case Iop_Xor64: case Iop_Xor32:
+ case Iop_Xor16: case Iop_Xor8:
+ case Iop_XorV128:
+ case Iop_Sub64: case Iop_Sub32:
+ if (sameIRTemps(e->Iex.Binop.arg1, e->Iex.Binop.arg2))
+ e2 = mkZeroOfPrimopResultType(e->Iex.Binop.op);
+ break;
+ default:
+ break;
+ }
+
+ switch (e->Iex.Binop.op) {
+ case Iop_CmpEQ64:
+ case Iop_CmpEQ8x8:
+ case Iop_CmpEQ8x16:
+ if (sameIRTemps(e->Iex.Binop.arg1, e->Iex.Binop.arg2))
+ e2 = mkOnesOfPrimopResultType(e->Iex.Binop.op);
+ break;
+ default:
+ break;
+ }
+
+ }
+ }
+
+ /* Mux0X */
+ if (e->tag == Iex_Mux0X) {
+ /* is the discriminant a constant? */
+ if (e->Iex.Mux0X.cond->tag == Iex_Const) {
+ Bool zero;
+ /* this is assured by the IR type rules */
+ vassert(e->Iex.Mux0X.cond->Iex.Const.con->tag == Ico_U8);
+ zero = toBool(0 == (0xFF & e->Iex.Mux0X.cond
+ ->Iex.Const.con->Ico.U8));
+ e2 = zero ? e->Iex.Mux0X.expr0 : e->Iex.Mux0X.exprX;
+ }
+ else
+ /* are the arms identical? (pretty weedy test) */
+ if (sameIRTempsOrIcoU32s(e->Iex.Mux0X.expr0,
+ e->Iex.Mux0X.exprX)) {
+ e2 = e->Iex.Mux0X.expr0;
+ }
+ }
+
+ /* Show cases where we've found but not folded 'op(t,t)'. */
+ if (0 && e == e2 && e->tag == Iex_Binop
+ && sameIRTemps(e->Iex.Binop.arg1, e->Iex.Binop.arg2)) {
+ vex_printf("IDENT: ");
+ ppIRExpr(e); vex_printf("\n");
+ }
+
+ /* Show the overall results of folding. */
+ if (DEBUG_IROPT && e2 != e) {
+ vex_printf("FOLD: ");
+ ppIRExpr(e); vex_printf(" -> ");
+ ppIRExpr(e2); vex_printf("\n");
+ }
+
+ return e2;
+
+ unhandled:
+# if 0
+ vex_printf("\n\n");
+ ppIRExpr(e);
+ vpanic("fold_Expr: no rule for the above");
+# else
+ if (vex_control.iropt_verbosity > 0) {
+ vex_printf("vex iropt: fold_Expr: no rule for: ");
+ ppIRExpr(e);
+ vex_printf("\n");
+ }
+ return e2;
+# endif
+}
+
+
+/* Apply the subst to a simple 1-level expression -- guaranteed to be
+   1-level due to the previous flattening pass. */
+
+static IRExpr* subst_Expr ( IRExpr** env, IRExpr* ex )
+{
+ switch (ex->tag) {
+ case Iex_RdTmp:
+ if (env[(Int)ex->Iex.RdTmp.tmp] != NULL) {
+ return env[(Int)ex->Iex.RdTmp.tmp];
+ } else {
+ /* not bound in env */
+ return ex;
+ }
+
+ case Iex_Const:
+ case Iex_Get:
+ return ex;
+
+ case Iex_GetI:
+ vassert(isIRAtom(ex->Iex.GetI.ix));
+ return IRExpr_GetI(
+ ex->Iex.GetI.descr,
+ subst_Expr(env, ex->Iex.GetI.ix),
+ ex->Iex.GetI.bias
+ );
+
+ case Iex_Qop:
+ vassert(isIRAtom(ex->Iex.Qop.arg1));
+ vassert(isIRAtom(ex->Iex.Qop.arg2));
+ vassert(isIRAtom(ex->Iex.Qop.arg3));
+ vassert(isIRAtom(ex->Iex.Qop.arg4));
+ return IRExpr_Qop(
+ ex->Iex.Qop.op,
+ subst_Expr(env, ex->Iex.Qop.arg1),
+ subst_Expr(env, ex->Iex.Qop.arg2),
+ subst_Expr(env, ex->Iex.Qop.arg3),
+ subst_Expr(env, ex->Iex.Qop.arg4)
+ );
+
+ case Iex_Triop:
+ vassert(isIRAtom(ex->Iex.Triop.arg1));
+ vassert(isIRAtom(ex->Iex.Triop.arg2));
+ vassert(isIRAtom(ex->Iex.Triop.arg3));
+ return IRExpr_Triop(
+ ex->Iex.Triop.op,
+ subst_Expr(env, ex->Iex.Triop.arg1),
+ subst_Expr(env, ex->Iex.Triop.arg2),
+ subst_Expr(env, ex->Iex.Triop.arg3)
+ );
+
+ case Iex_Binop:
+ vassert(isIRAtom(ex->Iex.Binop.arg1));
+ vassert(isIRAtom(ex->Iex.Binop.arg2));
+ return IRExpr_Binop(
+ ex->Iex.Binop.op,
+ subst_Expr(env, ex->Iex.Binop.arg1),
+ subst_Expr(env, ex->Iex.Binop.arg2)
+ );
+
+ case Iex_Unop:
+ vassert(isIRAtom(ex->Iex.Unop.arg));
+ return IRExpr_Unop(
+ ex->Iex.Unop.op,
+ subst_Expr(env, ex->Iex.Unop.arg)
+ );
+
+ case Iex_Load:
+ vassert(isIRAtom(ex->Iex.Load.addr));
+ return IRExpr_Load(
+ ex->Iex.Load.end,
+ ex->Iex.Load.ty,
+ subst_Expr(env, ex->Iex.Load.addr)
+ );
+
+ case Iex_CCall: {
+ Int i;
+ IRExpr** args2 = shallowCopyIRExprVec(ex->Iex.CCall.args);
+ for (i = 0; args2[i]; i++) {
+ vassert(isIRAtom(args2[i]));
+ args2[i] = subst_Expr(env, args2[i]);
+ }
+ return IRExpr_CCall(
+ ex->Iex.CCall.cee,
+ ex->Iex.CCall.retty,
+ args2
+ );
+ }
+
+ case Iex_Mux0X:
+ vassert(isIRAtom(ex->Iex.Mux0X.cond));
+ vassert(isIRAtom(ex->Iex.Mux0X.expr0));
+ vassert(isIRAtom(ex->Iex.Mux0X.exprX));
+ return IRExpr_Mux0X(
+ subst_Expr(env, ex->Iex.Mux0X.cond),
+ subst_Expr(env, ex->Iex.Mux0X.expr0),
+ subst_Expr(env, ex->Iex.Mux0X.exprX)
+ );
+
+ default:
+ vex_printf("\n\n"); ppIRExpr(ex);
+ vpanic("subst_Expr");
+
+ }
+}
+
+
+/* Apply the subst to stmt, then fold the result as much as possible.
+ Much simplified due to stmt being previously flattened. As a
+ result of this, the stmt may wind up being turned into a no-op.
+*/
+static IRStmt* subst_and_fold_Stmt ( IRExpr** env, IRStmt* st )
+{
+# if 0
+ vex_printf("\nsubst and fold stmt\n");
+ ppIRStmt(st);
+ vex_printf("\n");
+# endif
+
+ switch (st->tag) {
+ case Ist_AbiHint:
+ vassert(isIRAtom(st->Ist.AbiHint.base));
+ vassert(isIRAtom(st->Ist.AbiHint.nia));
+ return IRStmt_AbiHint(
+ fold_Expr(subst_Expr(env, st->Ist.AbiHint.base)),
+ st->Ist.AbiHint.len,
+ fold_Expr(subst_Expr(env, st->Ist.AbiHint.nia))
+ );
+ case Ist_Put:
+ vassert(isIRAtom(st->Ist.Put.data));
+ return IRStmt_Put(
+ st->Ist.Put.offset,
+ fold_Expr(subst_Expr(env, st->Ist.Put.data))
+ );
+
+ case Ist_PutI:
+ vassert(isIRAtom(st->Ist.PutI.ix));
+ vassert(isIRAtom(st->Ist.PutI.data));
+ return IRStmt_PutI(
+ st->Ist.PutI.descr,
+ fold_Expr(subst_Expr(env, st->Ist.PutI.ix)),
+ st->Ist.PutI.bias,
+ fold_Expr(subst_Expr(env, st->Ist.PutI.data))
+ );
+
+ case Ist_WrTmp:
+ /* This is the one place where an expr (st->Ist.WrTmp.data) is
+ allowed to be more than just a constant or a tmp. */
+ return IRStmt_WrTmp(
+ st->Ist.WrTmp.tmp,
+ fold_Expr(subst_Expr(env, st->Ist.WrTmp.data))
+ );
+
+ case Ist_Store:
+ vassert(isIRAtom(st->Ist.Store.addr));
+ vassert(isIRAtom(st->Ist.Store.data));
+ return IRStmt_Store(
+ st->Ist.Store.end,
+ fold_Expr(subst_Expr(env, st->Ist.Store.addr)),
+ fold_Expr(subst_Expr(env, st->Ist.Store.data))
+ );
+
+ case Ist_CAS: {
+ IRCAS *cas, *cas2;
+ cas = st->Ist.CAS.details;
+ vassert(isIRAtom(cas->addr));
+ vassert(cas->expdHi == NULL || isIRAtom(cas->expdHi));
+ vassert(isIRAtom(cas->expdLo));
+ vassert(cas->dataHi == NULL || isIRAtom(cas->dataHi));
+ vassert(isIRAtom(cas->dataLo));
+ cas2 = mkIRCAS(
+ cas->oldHi, cas->oldLo, cas->end,
+ fold_Expr(subst_Expr(env, cas->addr)),
+ cas->expdHi ? fold_Expr(subst_Expr(env, cas->expdHi)) : NULL,
+ fold_Expr(subst_Expr(env, cas->expdLo)),
+ cas->dataHi ? fold_Expr(subst_Expr(env, cas->dataHi)) : NULL,
+ fold_Expr(subst_Expr(env, cas->dataLo))
+ );
+ return IRStmt_CAS(cas2);
+ }
+
+ case Ist_LLSC:
+ vassert(isIRAtom(st->Ist.LLSC.addr));
+ if (st->Ist.LLSC.storedata)
+ vassert(isIRAtom(st->Ist.LLSC.storedata));
+ return IRStmt_LLSC(
+ st->Ist.LLSC.end,
+ st->Ist.LLSC.result,
+ fold_Expr(subst_Expr(env, st->Ist.LLSC.addr)),
+ st->Ist.LLSC.storedata
+ ? fold_Expr(subst_Expr(env, st->Ist.LLSC.storedata))
+ : NULL
+ );
+
+ case Ist_Dirty: {
+ Int i;
+ IRDirty *d, *d2;
+ d = st->Ist.Dirty.details;
+ d2 = emptyIRDirty();
+ *d2 = *d;
+ d2->args = shallowCopyIRExprVec(d2->args);
+ if (d2->mFx != Ifx_None) {
+ vassert(isIRAtom(d2->mAddr));
+ d2->mAddr = fold_Expr(subst_Expr(env, d2->mAddr));
+ }
+ vassert(isIRAtom(d2->guard));
+ d2->guard = fold_Expr(subst_Expr(env, d2->guard));
+ for (i = 0; d2->args[i]; i++) {
+ vassert(isIRAtom(d2->args[i]));
+ d2->args[i] = fold_Expr(subst_Expr(env, d2->args[i]));
+ }
+ return IRStmt_Dirty(d2);
+ }
+
+ case Ist_IMark:
+ return IRStmt_IMark(st->Ist.IMark.addr, st->Ist.IMark.len);
+
+ case Ist_NoOp:
+ return IRStmt_NoOp();
+
+ case Ist_MBE:
+ return IRStmt_MBE(st->Ist.MBE.event);
+
+ case Ist_Exit: {
+ IRExpr* fcond;
+ vassert(isIRAtom(st->Ist.Exit.guard));
+ fcond = fold_Expr(subst_Expr(env, st->Ist.Exit.guard));
+ if (fcond->tag == Iex_Const) {
+ /* Interesting. The condition on this exit has folded down to
+ a constant. */
+ vassert(fcond->Iex.Const.con->tag == Ico_U1);
+ vassert(fcond->Iex.Const.con->Ico.U1 == False
+ || fcond->Iex.Const.con->Ico.U1 == True);
+ if (fcond->Iex.Const.con->Ico.U1 == False) {
+ /* exit is never going to happen, so dump the statement. */
+ return IRStmt_NoOp();
+ } else {
+ vassert(fcond->Iex.Const.con->Ico.U1 == True);
+ /* Hmmm. The exit has become unconditional. Leave it
+ as it is for now, since we'd have to truncate the BB
+ at this point, which is tricky. Such truncation is
+ done later by the dead-code elimination pass. */
+ /* fall out into the reconstruct-the-exit code. */
+ if (vex_control.iropt_verbosity > 0)
+ /* really a misuse of vex_control.iropt_verbosity */
+ vex_printf("vex iropt: IRStmt_Exit became unconditional\n");
+ }
+ }
+ return IRStmt_Exit(fcond, st->Ist.Exit.jk, st->Ist.Exit.dst);
+ }
+
+ default:
+ vex_printf("\n"); ppIRStmt(st);
+ vpanic("subst_and_fold_Stmt");
+ }
+}
+
+
+IRSB* cprop_BB ( IRSB* in )
+{
+ Int i;
+ IRSB* out;
+ IRStmt* st2;
+ Int n_tmps = in->tyenv->types_used;
+ IRExpr** env = LibVEX_Alloc(n_tmps * sizeof(IRExpr*));
+
+ out = emptyIRSB();
+ out->tyenv = deepCopyIRTypeEnv( in->tyenv );
+
+   /* Set up the env which travels forward. This holds a
+ substitution, mapping IRTemps to atoms, that is, IRExprs which
+ are either IRTemps or IRConsts. Thus, copy and constant
+ propagation is done. The environment is to be applied as we
+ move along. Keys are IRTemps. Values are IRExpr*s.
+ */
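+   /* For example (a sketch, with made-up temp names): given the
+      flattened input
+         t1 = GET:I64(0)
+         t2 = t1
+         t3 = Add64(t2,0x10:I64)
+      the binding t2 -> t1 accumulates in env, so the third stmt is
+      rewritten as t3 = Add64(t1,0x10:I64), and 't2 = t1' itself
+      never reaches the output block. */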
+ for (i = 0; i < n_tmps; i++)
+ env[i] = NULL;
+
+ /* For each original SSA-form stmt ... */
+ for (i = 0; i < in->stmts_used; i++) {
+
+ /* First apply the substitution to the current stmt. This
+ propagates in any constants and tmp-tmp assignments
+         accumulated prior to this point.  As part of the
+         subst_and_fold_Stmt call, any resulting constant
+         expressions are also folded. */
+
+ st2 = in->stmts[i];
+
+ /* perhaps st2 is already a no-op? */
+ if (st2->tag == Ist_NoOp) continue;
+
+ st2 = subst_and_fold_Stmt( env, st2 );
+
+ /* If the statement has been folded into a no-op, forget it. */
+ if (st2->tag == Ist_NoOp) continue;
+
+ /* Now consider what the stmt looks like. If it's of the form
+ 't = const' or 't1 = t2', add it to the running environment
+ and not to the output BB. Otherwise, add it to the output
+ BB. Note, we choose not to propagate const when const is an
+ F64i, so that F64i literals can be CSE'd later. This helps
+ x86 floating point code generation. */
+
+ if (st2->tag == Ist_WrTmp
+ && st2->Ist.WrTmp.data->tag == Iex_Const
+ && st2->Ist.WrTmp.data->Iex.Const.con->tag != Ico_F64i) {
+ /* 't = const' -- add to env.
+ The pair (IRTemp, IRExpr*) is added. */
+ env[(Int)(st2->Ist.WrTmp.tmp)] = st2->Ist.WrTmp.data;
+ }
+ else
+ if (st2->tag == Ist_WrTmp && st2->Ist.WrTmp.data->tag == Iex_RdTmp) {
+ /* 't1 = t2' -- add to env.
+ The pair (IRTemp, IRExpr*) is added. */
+ env[(Int)(st2->Ist.WrTmp.tmp)] = st2->Ist.WrTmp.data;
+ }
+ else {
+ /* Not interesting, copy st2 into the output block. */
+ addStmtToIRSB( out, st2 );
+ }
+ }
+
+ out->next = subst_Expr( env, in->next );
+ out->jumpkind = in->jumpkind;
+ return out;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Dead code (t = E) removal ---*/
+/*---------------------------------------------------------------*/
+
+/* As a side effect, also removes all code following an unconditional
+ side exit. */
+
+/* The set of used IRTemps is maintained as a Bool array indexed
+   by IRTemp -- really just representing a set of IRTemps.
+*/
+
+inline
+static void addUses_Temp ( Bool* set, IRTemp tmp )
+{
+ set[(Int)tmp] = True;
+}
+
+static void addUses_Expr ( Bool* set, IRExpr* e )
+{
+ Int i;
+ switch (e->tag) {
+ case Iex_GetI:
+ addUses_Expr(set, e->Iex.GetI.ix);
+ return;
+ case Iex_Mux0X:
+ addUses_Expr(set, e->Iex.Mux0X.cond);
+ addUses_Expr(set, e->Iex.Mux0X.expr0);
+ addUses_Expr(set, e->Iex.Mux0X.exprX);
+ return;
+ case Iex_CCall:
+ for (i = 0; e->Iex.CCall.args[i]; i++)
+ addUses_Expr(set, e->Iex.CCall.args[i]);
+ return;
+ case Iex_Load:
+ addUses_Expr(set, e->Iex.Load.addr);
+ return;
+ case Iex_Qop:
+ addUses_Expr(set, e->Iex.Qop.arg1);
+ addUses_Expr(set, e->Iex.Qop.arg2);
+ addUses_Expr(set, e->Iex.Qop.arg3);
+ addUses_Expr(set, e->Iex.Qop.arg4);
+ return;
+ case Iex_Triop:
+ addUses_Expr(set, e->Iex.Triop.arg1);
+ addUses_Expr(set, e->Iex.Triop.arg2);
+ addUses_Expr(set, e->Iex.Triop.arg3);
+ return;
+ case Iex_Binop:
+ addUses_Expr(set, e->Iex.Binop.arg1);
+ addUses_Expr(set, e->Iex.Binop.arg2);
+ return;
+ case Iex_Unop:
+ addUses_Expr(set, e->Iex.Unop.arg);
+ return;
+ case Iex_RdTmp:
+ addUses_Temp(set, e->Iex.RdTmp.tmp);
+ return;
+ case Iex_Const:
+ case Iex_Get:
+ return;
+ default:
+ vex_printf("\n");
+ ppIRExpr(e);
+ vpanic("addUses_Expr");
+ }
+}
+
+static void addUses_Stmt ( Bool* set, IRStmt* st )
+{
+ Int i;
+ IRDirty* d;
+ IRCAS* cas;
+ switch (st->tag) {
+ case Ist_AbiHint:
+ addUses_Expr(set, st->Ist.AbiHint.base);
+ addUses_Expr(set, st->Ist.AbiHint.nia);
+ return;
+ case Ist_PutI:
+ addUses_Expr(set, st->Ist.PutI.ix);
+ addUses_Expr(set, st->Ist.PutI.data);
+ return;
+ case Ist_WrTmp:
+ addUses_Expr(set, st->Ist.WrTmp.data);
+ return;
+ case Ist_Put:
+ addUses_Expr(set, st->Ist.Put.data);
+ return;
+ case Ist_Store:
+ addUses_Expr(set, st->Ist.Store.addr);
+ addUses_Expr(set, st->Ist.Store.data);
+ return;
+ case Ist_CAS:
+ cas = st->Ist.CAS.details;
+ addUses_Expr(set, cas->addr);
+ if (cas->expdHi)
+ addUses_Expr(set, cas->expdHi);
+ addUses_Expr(set, cas->expdLo);
+ if (cas->dataHi)
+ addUses_Expr(set, cas->dataHi);
+ addUses_Expr(set, cas->dataLo);
+ return;
+ case Ist_LLSC:
+ addUses_Expr(set, st->Ist.LLSC.addr);
+ if (st->Ist.LLSC.storedata)
+ addUses_Expr(set, st->Ist.LLSC.storedata);
+ return;
+ case Ist_Dirty:
+ d = st->Ist.Dirty.details;
+ if (d->mFx != Ifx_None)
+ addUses_Expr(set, d->mAddr);
+ addUses_Expr(set, d->guard);
+ for (i = 0; d->args[i] != NULL; i++)
+ addUses_Expr(set, d->args[i]);
+ return;
+ case Ist_NoOp:
+ case Ist_IMark:
+ case Ist_MBE:
+ return;
+ case Ist_Exit:
+ addUses_Expr(set, st->Ist.Exit.guard);
+ return;
+ default:
+ vex_printf("\n");
+ ppIRStmt(st);
+ vpanic("addUses_Stmt");
+ }
+}
+
+
+/* Is this literally IRExpr_Const(IRConst_U1(False)) ? */
+static Bool isZeroU1 ( IRExpr* e )
+{
+ return toBool( e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U1
+ && e->Iex.Const.con->Ico.U1 == False );
+}
+
+/* Is this literally IRExpr_Const(IRConst_U1(True)) ? */
+static Bool isOneU1 ( IRExpr* e )
+{
+ return toBool( e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_U1
+ && e->Iex.Const.con->Ico.U1 == True );
+}
+
+
+/* Note, this destructively modifies the given IRSB. */
+
+/* Scan backwards through statements, carrying a set of IRTemps which
+ are known to be used after the current point. On encountering 't =
+ E', delete the binding if it is not used. Otherwise, add any temp
+ uses to the set and keep on moving backwards.
+
+ As an enhancement, the first (backwards) pass searches for IR exits
+ with always-taken conditions and notes the location of the earliest
+ one in the block. If any such are found, a second pass copies the
+ exit destination and jump kind to the bb-end. Then, the exit and
+ all statements following it are turned into no-ops.
+*/
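+
+/* For example (a sketch): scanning backwards through
+      t1 = GET:I32(8)
+      t2 = Add32(t1,0x1:I32)
+      PUT(8) = t1
+   the use-set after the PUT contains only t1, so 't2 = ...' is dead
+   and is overwritten with a no-op, whereas 't1 = ...' survives. */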
+
+/* notstatic */ void do_deadcode_BB ( IRSB* bb )
+{
+ Int i, i_unconditional_exit;
+ Int n_tmps = bb->tyenv->types_used;
+ Bool* set = LibVEX_Alloc(n_tmps * sizeof(Bool));
+ IRStmt* st;
+
+ for (i = 0; i < n_tmps; i++)
+ set[i] = False;
+
+ /* start off by recording IRTemp uses in the next field. */
+ addUses_Expr(set, bb->next);
+
+ /* First pass */
+
+ /* Work backwards through the stmts */
+ i_unconditional_exit = -1;
+ for (i = bb->stmts_used-1; i >= 0; i--) {
+ st = bb->stmts[i];
+ if (st->tag == Ist_NoOp)
+ continue;
+ /* take note of any unconditional exits */
+ if (st->tag == Ist_Exit
+ && isOneU1(st->Ist.Exit.guard))
+ i_unconditional_exit = i;
+ if (st->tag == Ist_WrTmp
+ && set[(Int)(st->Ist.WrTmp.tmp)] == False) {
+ /* it's an IRTemp which never got used. Delete it. */
+ if (DEBUG_IROPT) {
+ vex_printf("DEAD: ");
+ ppIRStmt(st);
+ vex_printf("\n");
+ }
+ bb->stmts[i] = IRStmt_NoOp();
+ }
+ else
+ if (st->tag == Ist_Dirty
+ && st->Ist.Dirty.details->guard
+ && isZeroU1(st->Ist.Dirty.details->guard)) {
+ /* This is a dirty helper which will never get called.
+ Delete it. */
+ bb->stmts[i] = IRStmt_NoOp();
+ }
+ else {
+ /* Note any IRTemp uses made by the current statement. */
+ addUses_Stmt(set, st);
+ }
+ }
+
+ /* Optional second pass: if any unconditional exits were found,
+ delete them and all following statements. */
+
+ if (i_unconditional_exit != -1) {
+ if (0) vex_printf("ZAPPING ALL FORWARDS from %d\n",
+ i_unconditional_exit);
+ vassert(i_unconditional_exit >= 0
+ && i_unconditional_exit < bb->stmts_used);
+ bb->next
+ = IRExpr_Const( bb->stmts[i_unconditional_exit]->Ist.Exit.dst );
+ bb->jumpkind
+ = bb->stmts[i_unconditional_exit]->Ist.Exit.jk;
+ for (i = i_unconditional_exit; i < bb->stmts_used; i++)
+ bb->stmts[i] = IRStmt_NoOp();
+ }
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Specialisation of helper function calls, in ---*/
+/*--- collaboration with the front end ---*/
+/*---------------------------------------------------------------*/
+
+static
+IRSB* spec_helpers_BB(
+ IRSB* bb,
+ IRExpr* (*specHelper) (HChar*, IRExpr**, IRStmt**, Int)
+ )
+{
+ Int i;
+ IRStmt* st;
+ IRExpr* ex;
+ Bool any = False;
+
+ for (i = bb->stmts_used-1; i >= 0; i--) {
+ st = bb->stmts[i];
+
+ if (st->tag != Ist_WrTmp
+ || st->Ist.WrTmp.data->tag != Iex_CCall)
+ continue;
+
+ ex = (*specHelper)( st->Ist.WrTmp.data->Iex.CCall.cee->name,
+ st->Ist.WrTmp.data->Iex.CCall.args,
+ &bb->stmts[0], i );
+ if (!ex)
+ /* the front end can't think of a suitable replacement */
+ continue;
+
+ /* We got something better. Install it in the bb. */
+ any = True;
+ bb->stmts[i]
+ = IRStmt_WrTmp(st->Ist.WrTmp.tmp, ex);
+
+ if (0) {
+ vex_printf("SPEC: ");
+ ppIRExpr(st->Ist.WrTmp.data);
+ vex_printf(" --> ");
+ ppIRExpr(ex);
+ vex_printf("\n");
+ }
+ }
+
+ if (any)
+ bb = flatten_BB(bb);
+ return bb;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Determination of guest state aliasing relationships ---*/
+/*---------------------------------------------------------------*/
+
+/* These are helper functions for CSE and GetI/PutI transformations.
+
+ Determine, to the extent possible, the relationship between two
+ guest state accesses. The possible outcomes are:
+
+ * Exact alias. These two accesses denote precisely the same
+ piece of the guest state.
+
+ * Definitely no alias. These two accesses are guaranteed not to
+ overlap any part of the guest state.
+
+ * Unknown -- if neither of the above can be established.
+
+ If in doubt, return Unknown. */
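+
+/* For example (a sketch): a GetI ranging over guest offsets
+   [0 .. 63] and a Put of an I64 at offset 128 touch disjoint byte
+   ranges, hence NoAlias; two GetIs with identical descriptors,
+   identical index tmps and equal (normalised) biases are an
+   ExactAlias; anything unprovable is UnknownAlias. */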
+
+typedef
+ enum { ExactAlias, NoAlias, UnknownAlias }
+ GSAliasing;
+
+
+/* Produces the alias relation between an indexed guest
+ state access and a non-indexed access. */
+
+static
+GSAliasing getAliasingRelation_IC ( IRRegArray* descr1, IRExpr* ix1,
+ Int offset2, IRType ty2 )
+{
+ UInt minoff1, maxoff1, minoff2, maxoff2;
+
+ getArrayBounds( descr1, &minoff1, &maxoff1 );
+ minoff2 = offset2;
+ maxoff2 = minoff2 + sizeofIRType(ty2) - 1;
+
+ if (maxoff1 < minoff2 || maxoff2 < minoff1)
+ return NoAlias;
+
+ /* Could probably do better here if required. For the moment
+ however just claim not to know anything more. */
+ return UnknownAlias;
+}
+
+
+/* Produces the alias relation between two indexed guest state
+ accesses. */
+
+static
+GSAliasing getAliasingRelation_II (
+ IRRegArray* descr1, IRExpr* ix1, Int bias1,
+ IRRegArray* descr2, IRExpr* ix2, Int bias2
+ )
+{
+ UInt minoff1, maxoff1, minoff2, maxoff2;
+ Int iters;
+
+ /* First try hard to show they don't alias. */
+ getArrayBounds( descr1, &minoff1, &maxoff1 );
+ getArrayBounds( descr2, &minoff2, &maxoff2 );
+ if (maxoff1 < minoff2 || maxoff2 < minoff1)
+ return NoAlias;
+
+ /* So the two arrays at least partially overlap. To get any
+ further we'll have to be sure that the descriptors are
+ identical. */
+ if (!eqIRRegArray(descr1, descr2))
+ return UnknownAlias;
+
+ /* The descriptors are identical. Now the only difference can be
+ in the index expressions. If they cannot be shown to be
+ identical, we have to say we don't know what the aliasing
+ relation will be. Now, since the IR is flattened, the index
+ expressions should be atoms -- either consts or tmps. So that
+ makes the comparison simple. */
+ vassert(isIRAtom(ix1));
+ vassert(isIRAtom(ix2));
+ if (!eqIRAtom(ix1,ix2))
+ return UnknownAlias;
+
+ /* Ok, the index expressions are identical. So now the only way
+ they can be different is in the bias. Normalise this
+ paranoidly, to reliably establish equality/non-equality. */
+
+ /* So now we know that the GetI and PutI index the same array
+ with the same base. Are the offsets the same, modulo the
+ array size? Do this paranoidly. */
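+   /* For example (a sketch): with nElems == 8, bias1 == -3 and
+      bias2 == 13 both normalise to 5, giving ExactAlias. */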
+ vassert(descr1->nElems == descr2->nElems);
+ vassert(descr1->elemTy == descr2->elemTy);
+ vassert(descr1->base == descr2->base);
+ iters = 0;
+ while (bias1 < 0 || bias2 < 0) {
+ bias1 += descr1->nElems;
+ bias2 += descr1->nElems;
+ iters++;
+ if (iters > 10)
+ vpanic("getAliasingRelation: iters");
+ }
+ vassert(bias1 >= 0 && bias2 >= 0);
+ bias1 %= descr1->nElems;
+ bias2 %= descr1->nElems;
+ vassert(bias1 >= 0 && bias1 < descr1->nElems);
+ vassert(bias2 >= 0 && bias2 < descr1->nElems);
+
+   /* Finally, bias1 and bias2 are normalised into the range
+      0 .. descr1->nElems - 1.  And so we can establish
+ equality/non-equality. */
+
+ return bias1==bias2 ? ExactAlias : NoAlias;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Common Subexpression Elimination ---*/
+/*---------------------------------------------------------------*/
+
+/* Expensive in time and space. */
+
+/* Uses two environments:
+   an IRTemp -> IRTemp mapping
+ a mapping from AvailExpr* to IRTemp
+*/
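+
+/* For example (a sketch): in
+      t1 = Add32(t8,t9)
+      t2 = Add32(t8,t9)
+      t3 = Sub32(t2,t1)
+   the second Add32 matches the expression already bound to t1, so
+   that stmt becomes 't2 = t1' and tenv records t2 -> t1; the third
+   stmt is then seen, after tenv substitution, as Sub32(t1,t1). */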
+
+typedef
+ struct {
+ enum { Ut, Btt, Btc, Bct, Cf64i, Mttt, GetIt } tag;
+ union {
+ /* unop(tmp) */
+ struct {
+ IROp op;
+ IRTemp arg;
+ } Ut;
+ /* binop(tmp,tmp) */
+ struct {
+ IROp op;
+ IRTemp arg1;
+ IRTemp arg2;
+ } Btt;
+ /* binop(tmp,const) */
+ struct {
+ IROp op;
+ IRTemp arg1;
+ IRConst con2;
+ } Btc;
+ /* binop(const,tmp) */
+ struct {
+ IROp op;
+ IRConst con1;
+ IRTemp arg2;
+ } Bct;
+ /* F64i-style const */
+ struct {
+ ULong f64i;
+ } Cf64i;
+ /* Mux0X(tmp,tmp,tmp) */
+ struct {
+ IRTemp co;
+ IRTemp e0;
+ IRTemp eX;
+ } Mttt;
+ /* GetI(descr,tmp,bias)*/
+ struct {
+ IRRegArray* descr;
+ IRTemp ix;
+ Int bias;
+ } GetIt;
+ } u;
+ }
+ AvailExpr;
+
+static Bool eq_AvailExpr ( AvailExpr* a1, AvailExpr* a2 )
+{
+ if (a1->tag != a2->tag)
+ return False;
+ switch (a1->tag) {
+ case Ut:
+ return toBool(
+ a1->u.Ut.op == a2->u.Ut.op
+ && a1->u.Ut.arg == a2->u.Ut.arg);
+ case Btt:
+ return toBool(
+ a1->u.Btt.op == a2->u.Btt.op
+ && a1->u.Btt.arg1 == a2->u.Btt.arg1
+ && a1->u.Btt.arg2 == a2->u.Btt.arg2);
+ case Btc:
+ return toBool(
+ a1->u.Btc.op == a2->u.Btc.op
+ && a1->u.Btc.arg1 == a2->u.Btc.arg1
+ && eqIRConst(&a1->u.Btc.con2, &a2->u.Btc.con2));
+ case Bct:
+ return toBool(
+ a1->u.Bct.op == a2->u.Bct.op
+ && a1->u.Bct.arg2 == a2->u.Bct.arg2
+ && eqIRConst(&a1->u.Bct.con1, &a2->u.Bct.con1));
+ case Cf64i:
+ return toBool(a1->u.Cf64i.f64i == a2->u.Cf64i.f64i);
+ case Mttt:
+ return toBool(a1->u.Mttt.co == a2->u.Mttt.co
+ && a1->u.Mttt.e0 == a2->u.Mttt.e0
+ && a1->u.Mttt.eX == a2->u.Mttt.eX);
+ case GetIt:
+ return toBool(eqIRRegArray(a1->u.GetIt.descr, a2->u.GetIt.descr)
+ && a1->u.GetIt.ix == a2->u.GetIt.ix
+ && a1->u.GetIt.bias == a2->u.GetIt.bias);
+ default: vpanic("eq_AvailExpr");
+ }
+}
+
+static IRExpr* availExpr_to_IRExpr ( AvailExpr* ae )
+{
+ IRConst* con;
+ switch (ae->tag) {
+ case Ut:
+ return IRExpr_Unop( ae->u.Ut.op, IRExpr_RdTmp(ae->u.Ut.arg) );
+ case Btt:
+ return IRExpr_Binop( ae->u.Btt.op,
+ IRExpr_RdTmp(ae->u.Btt.arg1),
+ IRExpr_RdTmp(ae->u.Btt.arg2) );
+ case Btc:
+ con = LibVEX_Alloc(sizeof(IRConst));
+ *con = ae->u.Btc.con2;
+ return IRExpr_Binop( ae->u.Btc.op,
+ IRExpr_RdTmp(ae->u.Btc.arg1),
+ IRExpr_Const(con) );
+ case Bct:
+ con = LibVEX_Alloc(sizeof(IRConst));
+ *con = ae->u.Bct.con1;
+ return IRExpr_Binop( ae->u.Bct.op,
+ IRExpr_Const(con),
+ IRExpr_RdTmp(ae->u.Bct.arg2) );
+ case Cf64i:
+ return IRExpr_Const(IRConst_F64i(ae->u.Cf64i.f64i));
+ case Mttt:
+ return IRExpr_Mux0X(IRExpr_RdTmp(ae->u.Mttt.co),
+ IRExpr_RdTmp(ae->u.Mttt.e0),
+ IRExpr_RdTmp(ae->u.Mttt.eX));
+ case GetIt:
+ return IRExpr_GetI(ae->u.GetIt.descr,
+ IRExpr_RdTmp(ae->u.GetIt.ix),
+ ae->u.GetIt.bias);
+ default:
+ vpanic("availExpr_to_IRExpr");
+ }
+}
+
+inline
+static IRTemp subst_AvailExpr_Temp ( HashHW* env, IRTemp tmp )
+{
+ HWord res;
+ /* env :: IRTemp -> IRTemp */
+ if (lookupHHW( env, &res, (HWord)tmp ))
+ return (IRTemp)res;
+ else
+ return tmp;
+}
+
+static void subst_AvailExpr ( HashHW* env, AvailExpr* ae )
+{
+ /* env :: IRTemp -> IRTemp */
+ switch (ae->tag) {
+ case Ut:
+ ae->u.Ut.arg = subst_AvailExpr_Temp( env, ae->u.Ut.arg );
+ break;
+ case Btt:
+ ae->u.Btt.arg1 = subst_AvailExpr_Temp( env, ae->u.Btt.arg1 );
+ ae->u.Btt.arg2 = subst_AvailExpr_Temp( env, ae->u.Btt.arg2 );
+ break;
+ case Btc:
+ ae->u.Btc.arg1 = subst_AvailExpr_Temp( env, ae->u.Btc.arg1 );
+ break;
+ case Bct:
+ ae->u.Bct.arg2 = subst_AvailExpr_Temp( env, ae->u.Bct.arg2 );
+ break;
+ case Cf64i:
+ break;
+ case Mttt:
+ ae->u.Mttt.co = subst_AvailExpr_Temp( env, ae->u.Mttt.co );
+ ae->u.Mttt.e0 = subst_AvailExpr_Temp( env, ae->u.Mttt.e0 );
+ ae->u.Mttt.eX = subst_AvailExpr_Temp( env, ae->u.Mttt.eX );
+ break;
+ case GetIt:
+ ae->u.GetIt.ix = subst_AvailExpr_Temp( env, ae->u.GetIt.ix );
+ break;
+ default:
+ vpanic("subst_AvailExpr");
+ }
+}
+
+static AvailExpr* irExpr_to_AvailExpr ( IRExpr* e )
+{
+ AvailExpr* ae;
+
+ if (e->tag == Iex_Unop
+ && e->Iex.Unop.arg->tag == Iex_RdTmp) {
+ ae = LibVEX_Alloc(sizeof(AvailExpr));
+ ae->tag = Ut;
+ ae->u.Ut.op = e->Iex.Unop.op;
+ ae->u.Ut.arg = e->Iex.Unop.arg->Iex.RdTmp.tmp;
+ return ae;
+ }
+
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.arg1->tag == Iex_RdTmp
+ && e->Iex.Binop.arg2->tag == Iex_RdTmp) {
+ ae = LibVEX_Alloc(sizeof(AvailExpr));
+ ae->tag = Btt;
+ ae->u.Btt.op = e->Iex.Binop.op;
+ ae->u.Btt.arg1 = e->Iex.Binop.arg1->Iex.RdTmp.tmp;
+ ae->u.Btt.arg2 = e->Iex.Binop.arg2->Iex.RdTmp.tmp;
+ return ae;
+ }
+
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.arg1->tag == Iex_RdTmp
+ && e->Iex.Binop.arg2->tag == Iex_Const) {
+ ae = LibVEX_Alloc(sizeof(AvailExpr));
+ ae->tag = Btc;
+ ae->u.Btc.op = e->Iex.Binop.op;
+ ae->u.Btc.arg1 = e->Iex.Binop.arg1->Iex.RdTmp.tmp;
+ ae->u.Btc.con2 = *(e->Iex.Binop.arg2->Iex.Const.con);
+ return ae;
+ }
+
+ if (e->tag == Iex_Binop
+ && e->Iex.Binop.arg1->tag == Iex_Const
+ && e->Iex.Binop.arg2->tag == Iex_RdTmp) {
+ ae = LibVEX_Alloc(sizeof(AvailExpr));
+ ae->tag = Bct;
+ ae->u.Bct.op = e->Iex.Binop.op;
+ ae->u.Bct.arg2 = e->Iex.Binop.arg2->Iex.RdTmp.tmp;
+ ae->u.Bct.con1 = *(e->Iex.Binop.arg1->Iex.Const.con);
+ return ae;
+ }
+
+ if (e->tag == Iex_Const
+ && e->Iex.Const.con->tag == Ico_F64i) {
+ ae = LibVEX_Alloc(sizeof(AvailExpr));
+ ae->tag = Cf64i;
+ ae->u.Cf64i.f64i = e->Iex.Const.con->Ico.F64i;
+ return ae;
+ }
+
+ if (e->tag == Iex_Mux0X
+ && e->Iex.Mux0X.cond->tag == Iex_RdTmp
+ && e->Iex.Mux0X.expr0->tag == Iex_RdTmp
+ && e->Iex.Mux0X.exprX->tag == Iex_RdTmp) {
+ ae = LibVEX_Alloc(sizeof(AvailExpr));
+ ae->tag = Mttt;
+ ae->u.Mttt.co = e->Iex.Mux0X.cond->Iex.RdTmp.tmp;
+ ae->u.Mttt.e0 = e->Iex.Mux0X.expr0->Iex.RdTmp.tmp;
+ ae->u.Mttt.eX = e->Iex.Mux0X.exprX->Iex.RdTmp.tmp;
+ return ae;
+ }
+
+ if (e->tag == Iex_GetI
+ && e->Iex.GetI.ix->tag == Iex_RdTmp) {
+ ae = LibVEX_Alloc(sizeof(AvailExpr));
+ ae->tag = GetIt;
+ ae->u.GetIt.descr = e->Iex.GetI.descr;
+ ae->u.GetIt.ix = e->Iex.GetI.ix->Iex.RdTmp.tmp;
+ ae->u.GetIt.bias = e->Iex.GetI.bias;
+ return ae;
+ }
+
+ return NULL;
+}
+
+
+/* The BB is modified in-place. Returns True if any changes were
+ made. */
+
+static Bool do_cse_BB ( IRSB* bb )
+{
+ Int i, j, paranoia;
+ IRTemp t, q;
+ IRStmt* st;
+ AvailExpr* eprime;
+ AvailExpr* ae;
+ Bool invalidate;
+ Bool anyDone = False;
+
+ HashHW* tenv = newHHW(); /* :: IRTemp -> IRTemp */
+ HashHW* aenv = newHHW(); /* :: AvailExpr* -> IRTemp */
+
+ vassert(sizeof(IRTemp) <= sizeof(HWord));
+
+ if (0) { ppIRSB(bb); vex_printf("\n\n"); }
+
+ /* Iterate forwards over the stmts.
+      On seeing "t = E", where E is one of the 7 AvailExpr forms:
+ let E' = apply tenv substitution to E
+ search aenv for E'
+ if a mapping E' -> q is found,
+ replace this stmt by "t = q"
+ and add binding t -> q to tenv
+ else
+ add binding E' -> t to aenv
+ replace this stmt by "t = E'"
+
+ Other statements are only interesting to the extent that they
+ might invalidate some of the expressions in aenv. So there is
+ an invalidate-bindings check for each statement seen.
+ */
+ for (i = 0; i < bb->stmts_used; i++) {
+ st = bb->stmts[i];
+
+ /* ------ BEGIN invalidate aenv bindings ------ */
+ /* This is critical: remove from aenv any E' -> .. bindings
+ which might be invalidated by this statement. The only
+         vulnerable kind of binding is the GetI kind.
+ Dirty call - dump (paranoia level -> 2)
+ Store - dump (ditto)
+ Put, PutI - dump unless no-overlap is proven (.. -> 1)
+ Uses getAliasingRelation_IC and getAliasingRelation_II
+ to do the no-overlap assessments needed for Put/PutI.
+ */
+ switch (st->tag) {
+ case Ist_Dirty: case Ist_Store: case Ist_MBE:
+ case Ist_CAS: case Ist_LLSC:
+ paranoia = 2; break;
+ case Ist_Put: case Ist_PutI:
+ paranoia = 1; break;
+ case Ist_NoOp: case Ist_IMark: case Ist_AbiHint:
+ case Ist_WrTmp: case Ist_Exit:
+ paranoia = 0; break;
+ default:
+ vpanic("do_cse_BB(1)");
+ }
+
+ if (paranoia > 0) {
+ for (j = 0; j < aenv->used; j++) {
+ if (!aenv->inuse[j])
+ continue;
+ ae = (AvailExpr*)aenv->key[j];
+ if (ae->tag != GetIt)
+ continue;
+ invalidate = False;
+ if (paranoia >= 2) {
+ invalidate = True;
+ } else {
+ vassert(paranoia == 1);
+ if (st->tag == Ist_Put) {
+ if (getAliasingRelation_IC(
+ ae->u.GetIt.descr,
+ IRExpr_RdTmp(ae->u.GetIt.ix),
+ st->Ist.Put.offset,
+ typeOfIRExpr(bb->tyenv,st->Ist.Put.data)
+ ) != NoAlias)
+ invalidate = True;
+ }
+ else
+ if (st->tag == Ist_PutI) {
+ if (getAliasingRelation_II(
+ ae->u.GetIt.descr,
+ IRExpr_RdTmp(ae->u.GetIt.ix),
+ ae->u.GetIt.bias,
+ st->Ist.PutI.descr,
+ st->Ist.PutI.ix,
+ st->Ist.PutI.bias
+ ) != NoAlias)
+ invalidate = True;
+ }
+ else
+ vpanic("do_cse_BB(2)");
+ }
+
+ if (invalidate) {
+ aenv->inuse[j] = False;
+ aenv->key[j] = (HWord)NULL; /* be sure */
+ }
+ } /* for j */
+ } /* paranoia > 0 */
+
+      /* ------ END invalidate aenv bindings ------ */
+
+ /* ignore not-interestings */
+ if (st->tag != Ist_WrTmp)
+ continue;
+
+ t = st->Ist.WrTmp.tmp;
+ eprime = irExpr_to_AvailExpr(st->Ist.WrTmp.data);
+ /* ignore if not of AvailExpr form */
+ if (!eprime)
+ continue;
+
+ /* vex_printf("considering: " ); ppIRStmt(st); vex_printf("\n"); */
+
+ /* apply tenv */
+ subst_AvailExpr( tenv, eprime );
+
+ /* search aenv for eprime, unfortunately the hard way */
+ for (j = 0; j < aenv->used; j++)
+ if (aenv->inuse[j] && eq_AvailExpr(eprime, (AvailExpr*)aenv->key[j]))
+ break;
+
+ if (j < aenv->used) {
+ /* A binding E' -> q was found. Replace stmt by "t = q" and
+ note the t->q binding in tenv. */
+ /* (this is the core of the CSE action) */
+ q = (IRTemp)aenv->val[j];
+ bb->stmts[i] = IRStmt_WrTmp( t, IRExpr_RdTmp(q) );
+ addToHHW( tenv, (HWord)t, (HWord)q );
+ anyDone = True;
+ } else {
+ /* No binding was found, so instead we add E' -> t to our
+ collection of available expressions, replace this stmt
+ with "t = E'", and move on. */
+ bb->stmts[i] = IRStmt_WrTmp( t, availExpr_to_IRExpr(eprime) );
+ addToHHW( aenv, (HWord)eprime, (HWord)t );
+ }
+ }
+
+ /*
+ ppIRSB(bb);
+ sanityCheckIRSB(bb, Ity_I32);
+ vex_printf("\n\n");
+ */
+ return anyDone;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Add32/Sub32 chain collapsing ---*/
+/*---------------------------------------------------------------*/
+
+/* ----- Helper functions for Add32/Sub32 chain collapsing ----- */
+
+/* Is this expression "Add32(tmp,const)" or "Sub32(tmp,const)" ? If
+ yes, set *tmp and *i32 appropriately. *i32 is set as if the
+ root node is Add32, not Sub32. */
+
+static Bool isAdd32OrSub32 ( IRExpr* e, IRTemp* tmp, Int* i32 )
+{
+ if (e->tag != Iex_Binop)
+ return False;
+ if (e->Iex.Binop.op != Iop_Add32 && e->Iex.Binop.op != Iop_Sub32)
+ return False;
+ if (e->Iex.Binop.arg1->tag != Iex_RdTmp)
+ return False;
+ if (e->Iex.Binop.arg2->tag != Iex_Const)
+ return False;
+ *tmp = e->Iex.Binop.arg1->Iex.RdTmp.tmp;
+ *i32 = (Int)(e->Iex.Binop.arg2->Iex.Const.con->Ico.U32);
+ if (e->Iex.Binop.op == Iop_Sub32)
+ *i32 = -*i32;
+ return True;
+}
+
+
+/* Figure out if tmp can be expressed as tmp2 +32 const, for some
+ other tmp2. Scan backwards from the specified start point -- an
+ optimisation. */
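+
+/* For example (a sketch): with
+      t2 = Add32(t1,0x4:I32)
+      t3 = Sub32(t2,0x1:I32)
+   collapseChain(bb, .., t3, &tmp2, &i32) walks t3 -> t2 -> t1 and
+   reports t3 == t1 + 3, i.e. *tmp2 == t1 and *i32 == 3. */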
+
+static Bool collapseChain ( IRSB* bb, Int startHere,
+ IRTemp tmp,
+ IRTemp* tmp2, Int* i32 )
+{
+ Int j, ii;
+ IRTemp vv;
+ IRStmt* st;
+ IRExpr* e;
+
+   /* the (var, con) pair contains the current 'representation' for
+ 'tmp'. We start with 'tmp + 0'. */
+ IRTemp var = tmp;
+ Int con = 0;
+
+ /* Scan backwards to see if tmp can be replaced by some other tmp
+ +/- a constant. */
+ for (j = startHere; j >= 0; j--) {
+ st = bb->stmts[j];
+ if (st->tag != Ist_WrTmp)
+ continue;
+ if (st->Ist.WrTmp.tmp != var)
+ continue;
+ e = st->Ist.WrTmp.data;
+ if (!isAdd32OrSub32(e, &vv, &ii))
+ break;
+ var = vv;
+ con += ii;
+ }
+ if (j == -1)
+ /* no earlier binding for var .. ill-formed IR */
+ vpanic("collapseChain");
+
+ /* so, did we find anything interesting? */
+ if (var == tmp)
+ return False; /* no .. */
+
+ *tmp2 = var;
+ *i32 = con;
+ return True;
+}
+
+
+/* ------- Main function for Add32/Sub32 chain collapsing ------ */
+
+static void collapse_AddSub_chains_BB ( IRSB* bb )
+{
+ IRStmt *st;
+ IRTemp var, var2;
+ Int i, con, con2;
+
+ for (i = bb->stmts_used-1; i >= 0; i--) {
+ st = bb->stmts[i];
+ if (st->tag == Ist_NoOp)
+ continue;
+
+ /* Try to collapse 't1 = Add32/Sub32(t2, con)'. */
+
+ if (st->tag == Ist_WrTmp
+ && isAdd32OrSub32(st->Ist.WrTmp.data, &var, &con)) {
+
+         /* So the RHS is of the form Add32(var,con) or Sub32(var,-con).
+ Find out if var can be expressed as var2 + con2. */
+ if (collapseChain(bb, i-1, var, &var2, &con2)) {
+ if (DEBUG_IROPT) {
+ vex_printf("replacing1 ");
+ ppIRStmt(st);
+ vex_printf(" with ");
+ }
+ con2 += con;
+ bb->stmts[i]
+ = IRStmt_WrTmp(
+ st->Ist.WrTmp.tmp,
+ (con2 >= 0)
+ ? IRExpr_Binop(Iop_Add32,
+ IRExpr_RdTmp(var2),
+ IRExpr_Const(IRConst_U32(con2)))
+ : IRExpr_Binop(Iop_Sub32,
+ IRExpr_RdTmp(var2),
+ IRExpr_Const(IRConst_U32(-con2)))
+ );
+ if (DEBUG_IROPT) {
+ ppIRStmt(bb->stmts[i]);
+ vex_printf("\n");
+ }
+ }
+
+ continue;
+ }
+
+ /* Try to collapse 't1 = GetI[t2, con]'. */
+
+ if (st->tag == Ist_WrTmp
+ && st->Ist.WrTmp.data->tag == Iex_GetI
+ && st->Ist.WrTmp.data->Iex.GetI.ix->tag == Iex_RdTmp
+ && collapseChain(bb, i-1, st->Ist.WrTmp.data->Iex.GetI.ix
+ ->Iex.RdTmp.tmp, &var2, &con2)) {
+ if (DEBUG_IROPT) {
+ vex_printf("replacing3 ");
+ ppIRStmt(st);
+ vex_printf(" with ");
+ }
+ con2 += st->Ist.WrTmp.data->Iex.GetI.bias;
+ bb->stmts[i]
+ = IRStmt_WrTmp(
+ st->Ist.WrTmp.tmp,
+ IRExpr_GetI(st->Ist.WrTmp.data->Iex.GetI.descr,
+ IRExpr_RdTmp(var2),
+ con2));
+ if (DEBUG_IROPT) {
+ ppIRStmt(bb->stmts[i]);
+ vex_printf("\n");
+ }
+ continue;
+ }
+
+ /* Perhaps st is PutI[t, con] ? */
+
+ if (st->tag == Ist_PutI
+ && st->Ist.PutI.ix->tag == Iex_RdTmp
+ && collapseChain(bb, i-1, st->Ist.PutI.ix->Iex.RdTmp.tmp,
+ &var2, &con2)) {
+ if (DEBUG_IROPT) {
+ vex_printf("replacing2 ");
+ ppIRStmt(st);
+ vex_printf(" with ");
+ }
+ con2 += st->Ist.PutI.bias;
+ bb->stmts[i]
+ = IRStmt_PutI(st->Ist.PutI.descr,
+ IRExpr_RdTmp(var2),
+ con2,
+ st->Ist.PutI.data);
+ if (DEBUG_IROPT) {
+ ppIRStmt(bb->stmts[i]);
+ vex_printf("\n");
+ }
+ continue;
+ }
+
+ } /* for */
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- PutI/GetI transformations ---*/
+/*---------------------------------------------------------------*/
+
+/* Given the parts (descr, tmp, bias) for a GetI, scan backwards from
+ the given starting point to find, if any, a PutI which writes
+ exactly the same piece of guest state, and so return the expression
+ that the PutI writes. This is the core of PutI-GetI forwarding. */
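+
+/* For example (a sketch): given
+      PUTI(descr)[t1,7] = t5
+      t6 = GETI(descr)[t1,7]
+   the backwards scan from the GetI reaches the PutI, establishes
+   ExactAlias, and returns t5, so the GetI can become 't6 = t5'. */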
+
+static
+IRExpr* findPutI ( IRSB* bb, Int startHere,
+ IRRegArray* descrG, IRExpr* ixG, Int biasG )
+{
+ Int j;
+ IRStmt* st;
+ GSAliasing relation;
+
+ if (0) {
+ vex_printf("\nfindPutI ");
+ ppIRRegArray(descrG);
+ vex_printf(" ");
+ ppIRExpr(ixG);
+ vex_printf(" %d\n", biasG);
+ }
+
+ /* Scan backwards in bb from startHere to find a suitable PutI
+ binding for (descrG, ixG, biasG), if any. */
+
+ for (j = startHere; j >= 0; j--) {
+ st = bb->stmts[j];
+ if (st->tag == Ist_NoOp)
+ continue;
+
+ if (st->tag == Ist_Put) {
+ /* Non-indexed Put. This can't give a binding, but we do
+ need to check it doesn't invalidate the search by
+ overlapping any part of the indexed guest state. */
+
+ relation
+ = getAliasingRelation_IC(
+ descrG, ixG,
+ st->Ist.Put.offset,
+ typeOfIRExpr(bb->tyenv,st->Ist.Put.data) );
+
+ if (relation == NoAlias) {
+ /* we're OK; keep going */
+ continue;
+ } else {
+ /* relation == UnknownAlias || relation == ExactAlias */
+ /* If this assertion fails, we've found a Put which writes
+ an area of guest state which is read by a GetI. Which
+ is unlikely (although not per se wrong). */
+ vassert(relation != ExactAlias);
+ /* This Put potentially writes guest state that the GetI
+ reads; we must fail. */
+ return NULL;
+ }
+ }
+
+ if (st->tag == Ist_PutI) {
+
+ relation = getAliasingRelation_II(
+ descrG, ixG, biasG,
+ st->Ist.PutI.descr,
+ st->Ist.PutI.ix,
+ st->Ist.PutI.bias
+ );
+
+ if (relation == NoAlias) {
+ /* This PutI definitely doesn't overlap. Ignore it and
+ keep going. */
+ continue; /* the for j loop */
+ }
+
+ if (relation == UnknownAlias) {
+            /* We don't know whether this PutI writes the same guest
+               state as the GetI reads.  So we have to give up. */
+ return NULL;
+ }
+
+ /* Otherwise, we've found what we're looking for. */
+ vassert(relation == ExactAlias);
+ return st->Ist.PutI.data;
+
+ } /* if (st->tag == Ist_PutI) */
+
+ if (st->tag == Ist_Dirty) {
+ /* Be conservative. If the dirty call has any guest effects at
+ all, give up. We could do better -- only give up if there
+ are any guest writes/modifies. */
+ if (st->Ist.Dirty.details->nFxState > 0)
+ return NULL;
+ }
+
+ } /* for */
+
+ /* No valid replacement was found. */
+ return NULL;
+}
+
+
+
+/* Assuming pi is a PutI stmt, is s2 identical to it (in the sense
+ that it writes exactly the same piece of guest state) ? Safe
+ answer: False. */
+
+static Bool identicalPutIs ( IRStmt* pi, IRStmt* s2 )
+{
+ vassert(pi->tag == Ist_PutI);
+ if (s2->tag != Ist_PutI)
+ return False;
+
+ return toBool(
+ getAliasingRelation_II(
+ pi->Ist.PutI.descr, pi->Ist.PutI.ix, pi->Ist.PutI.bias,
+ s2->Ist.PutI.descr, s2->Ist.PutI.ix, s2->Ist.PutI.bias
+ )
+ == ExactAlias
+ );
+}
+
+
+/* Assuming pi is a PutI stmt, is s2 a Get/GetI/Put/PutI which might
+ overlap it? Safe answer: True. Note, we could do a lot better
+ than this if needed. */
+
+static
+Bool guestAccessWhichMightOverlapPutI (
+ IRTypeEnv* tyenv, IRStmt* pi, IRStmt* s2
+ )
+{
+ GSAliasing relation;
+ UInt minoffP, maxoffP;
+
+ vassert(pi->tag == Ist_PutI);
+ getArrayBounds(pi->Ist.PutI.descr, &minoffP, &maxoffP);
+ switch (s2->tag) {
+
+ case Ist_NoOp:
+ case Ist_IMark:
+ return False;
+
+ case Ist_MBE:
+ case Ist_AbiHint:
+ /* just be paranoid ... these should be rare. */
+ return True;
+
+ case Ist_CAS:
+ /* This is unbelievably lame, but it's probably not
+ significant from a performance point of view. Really, a
+ CAS is a load-store op, so it should be safe to say False.
+ However .. */
+ return True;
+
+ case Ist_Dirty:
+ /* If the dirty call has any guest effects at all, give up.
+ Probably could do better. */
+ if (s2->Ist.Dirty.details->nFxState > 0)
+ return True;
+ return False;
+
+ case Ist_Put:
+ vassert(isIRAtom(s2->Ist.Put.data));
+ relation
+ = getAliasingRelation_IC(
+ pi->Ist.PutI.descr, pi->Ist.PutI.ix,
+ s2->Ist.Put.offset,
+ typeOfIRExpr(tyenv,s2->Ist.Put.data)
+ );
+ goto have_relation;
+
+ case Ist_PutI:
+ vassert(isIRAtom(s2->Ist.PutI.ix));
+ vassert(isIRAtom(s2->Ist.PutI.data));
+ relation
+ = getAliasingRelation_II(
+ pi->Ist.PutI.descr, pi->Ist.PutI.ix, pi->Ist.PutI.bias,
+ s2->Ist.PutI.descr, s2->Ist.PutI.ix, s2->Ist.PutI.bias
+ );
+ goto have_relation;
+
+ case Ist_WrTmp:
+ if (s2->Ist.WrTmp.data->tag == Iex_GetI) {
+ relation
+ = getAliasingRelation_II(
+ pi->Ist.PutI.descr, pi->Ist.PutI.ix,
+ pi->Ist.PutI.bias,
+ s2->Ist.WrTmp.data->Iex.GetI.descr,
+ s2->Ist.WrTmp.data->Iex.GetI.ix,
+ s2->Ist.WrTmp.data->Iex.GetI.bias
+ );
+ goto have_relation;
+ }
+ if (s2->Ist.WrTmp.data->tag == Iex_Get) {
+ relation
+ = getAliasingRelation_IC(
+ pi->Ist.PutI.descr, pi->Ist.PutI.ix,
+ s2->Ist.WrTmp.data->Iex.Get.offset,
+ s2->Ist.WrTmp.data->Iex.Get.ty
+ );
+ goto have_relation;
+ }
+ return False;
+
+ case Ist_Store:
+ vassert(isIRAtom(s2->Ist.Store.addr));
+ vassert(isIRAtom(s2->Ist.Store.data));
+ return False;
+
+ default:
+ vex_printf("\n"); ppIRStmt(s2); vex_printf("\n");
+ vpanic("guestAccessWhichMightOverlapPutI");
+ }
+
+ have_relation:
+ if (relation == NoAlias)
+ return False;
+ else
+ return True; /* ExactAlias or UnknownAlias */
+}
+
+
+
+/* ---------- PutI/GetI transformations main functions --------- */
+
+/* Remove redundant GetIs, to the extent that they can be detected.
+ bb is modified in-place. */
+
+static
+void do_redundant_GetI_elimination ( IRSB* bb )
+{
+ Int i;
+ IRStmt* st;
+
+ for (i = bb->stmts_used-1; i >= 0; i--) {
+ st = bb->stmts[i];
+ if (st->tag == Ist_NoOp)
+ continue;
+
+ if (st->tag == Ist_WrTmp
+ && st->Ist.WrTmp.data->tag == Iex_GetI
+ && st->Ist.WrTmp.data->Iex.GetI.ix->tag == Iex_RdTmp) {
+ IRRegArray* descr = st->Ist.WrTmp.data->Iex.GetI.descr;
+ IRExpr* ix = st->Ist.WrTmp.data->Iex.GetI.ix;
+ Int bias = st->Ist.WrTmp.data->Iex.GetI.bias;
+ IRExpr* replacement = findPutI(bb, i-1, descr, ix, bias);
+ if (replacement
+ && isIRAtom(replacement)
+ /* Make sure we're doing a type-safe transformation! */
+ && typeOfIRExpr(bb->tyenv, replacement) == descr->elemTy) {
+ if (DEBUG_IROPT) {
+ vex_printf("rGI: ");
+ ppIRExpr(st->Ist.WrTmp.data);
+ vex_printf(" -> ");
+ ppIRExpr(replacement);
+ vex_printf("\n");
+ }
+ bb->stmts[i] = IRStmt_WrTmp(st->Ist.WrTmp.tmp, replacement);
+ }
+ }
+ }
+
+}
+
+
+/* Remove redundant PutIs, to the extent that they can be detected.
+ bb is modified in-place. */
+
+static
+void do_redundant_PutI_elimination ( IRSB* bb )
+{
+ Int i, j;
+ Bool delete;
+ IRStmt *st, *stj;
+
+ for (i = 0; i < bb->stmts_used; i++) {
+ st = bb->stmts[i];
+ if (st->tag != Ist_PutI)
+ continue;
+ /* Ok, search forwards from here to see if we can find another
+ PutI which makes this one redundant, and dodging various
+ hazards. Search forwards:
+ * If conditional exit, give up (because anything after that
+ does not postdominate this put).
+ * If a Get which might overlap, give up (because this PutI
+           is then not necessarily dead).
+ * If a Put which is identical, stop with success.
+ * If a Put which might overlap, but is not identical, give up.
+ * If a dirty helper call which might write guest state, give up.
+ * If a Put which definitely doesn't overlap, or any other
+ kind of stmt, continue.
+ */
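+      /* For example (a sketch): in
+            PUTI(descr)[t1,0] = t2
+            t3 = Add64(t2,t2)
+            PUTI(descr)[t1,0] = t3
+         the second PutI is identical to the first, and nothing in
+         between exits or reads the overwritten state, so the first
+         PutI is replaced by a no-op. */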
+ delete = False;
+ for (j = i+1; j < bb->stmts_used; j++) {
+ stj = bb->stmts[j];
+ if (stj->tag == Ist_NoOp)
+ continue;
+ if (identicalPutIs(st, stj)) {
+ /* success! */
+ delete = True;
+ break;
+ }
+ if (stj->tag == Ist_Exit)
+ /* give up */
+ break;
+         if (stj->tag == Ist_Dirty)
+ /* give up; could do better here */
+ break;
+ if (guestAccessWhichMightOverlapPutI(bb->tyenv, st, stj))
+ /* give up */
+ break;
+ }
+
+ if (delete) {
+ if (DEBUG_IROPT) {
+ vex_printf("rPI: ");
+ ppIRStmt(st);
+ vex_printf("\n");
+ }
+ bb->stmts[i] = IRStmt_NoOp();
+ }
+
+ }
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- Loop unrolling ---*/
+/*---------------------------------------------------------------*/
+
+/* Adjust all tmp values (names) in e by delta. e is destructively
+ modified. */
+
+static void deltaIRExpr ( IRExpr* e, Int delta )
+{
+ Int i;
+ switch (e->tag) {
+ case Iex_RdTmp:
+ e->Iex.RdTmp.tmp += delta;
+ break;
+ case Iex_Get:
+ case Iex_Const:
+ break;
+ case Iex_GetI:
+ deltaIRExpr(e->Iex.GetI.ix, delta);
+ break;
+ case Iex_Qop:
+ deltaIRExpr(e->Iex.Qop.arg1, delta);
+ deltaIRExpr(e->Iex.Qop.arg2, delta);
+ deltaIRExpr(e->Iex.Qop.arg3, delta);
+ deltaIRExpr(e->Iex.Qop.arg4, delta);
+ break;
+ case Iex_Triop:
+ deltaIRExpr(e->Iex.Triop.arg1, delta);
+ deltaIRExpr(e->Iex.Triop.arg2, delta);
+ deltaIRExpr(e->Iex.Triop.arg3, delta);
+ break;
+ case Iex_Binop:
+ deltaIRExpr(e->Iex.Binop.arg1, delta);
+ deltaIRExpr(e->Iex.Binop.arg2, delta);
+ break;
+ case Iex_Unop:
+ deltaIRExpr(e->Iex.Unop.arg, delta);
+ break;
+ case Iex_Load:
+ deltaIRExpr(e->Iex.Load.addr, delta);
+ break;
+ case Iex_CCall:
+ for (i = 0; e->Iex.CCall.args[i]; i++)
+ deltaIRExpr(e->Iex.CCall.args[i], delta);
+ break;
+ case Iex_Mux0X:
+ deltaIRExpr(e->Iex.Mux0X.cond, delta);
+ deltaIRExpr(e->Iex.Mux0X.expr0, delta);
+ deltaIRExpr(e->Iex.Mux0X.exprX, delta);
+ break;
+ default:
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+ vpanic("deltaIRExpr");
+ }
+}
+
+/* Adjust all tmp values (names) in st by delta. st is destructively
+ modified. */
+
+static void deltaIRStmt ( IRStmt* st, Int delta )
+{
+ Int i;
+ IRDirty* d;
+ switch (st->tag) {
+ case Ist_NoOp:
+ case Ist_IMark:
+ case Ist_MBE:
+ break;
+ case Ist_AbiHint:
+ deltaIRExpr(st->Ist.AbiHint.base, delta);
+ deltaIRExpr(st->Ist.AbiHint.nia, delta);
+ break;
+ case Ist_Put:
+ deltaIRExpr(st->Ist.Put.data, delta);
+ break;
+ case Ist_PutI:
+ deltaIRExpr(st->Ist.PutI.ix, delta);
+ deltaIRExpr(st->Ist.PutI.data, delta);
+ break;
+ case Ist_WrTmp:
+ st->Ist.WrTmp.tmp += delta;
+ deltaIRExpr(st->Ist.WrTmp.data, delta);
+ break;
+ case Ist_Exit:
+ deltaIRExpr(st->Ist.Exit.guard, delta);
+ break;
+ case Ist_Store:
+ deltaIRExpr(st->Ist.Store.addr, delta);
+ deltaIRExpr(st->Ist.Store.data, delta);
+ break;
+ case Ist_CAS:
+ if (st->Ist.CAS.details->oldHi != IRTemp_INVALID)
+ st->Ist.CAS.details->oldHi += delta;
+ st->Ist.CAS.details->oldLo += delta;
+ deltaIRExpr(st->Ist.CAS.details->addr, delta);
+ if (st->Ist.CAS.details->expdHi)
+ deltaIRExpr(st->Ist.CAS.details->expdHi, delta);
+ deltaIRExpr(st->Ist.CAS.details->expdLo, delta);
+ if (st->Ist.CAS.details->dataHi)
+ deltaIRExpr(st->Ist.CAS.details->dataHi, delta);
+ deltaIRExpr(st->Ist.CAS.details->dataLo, delta);
+ break;
+ case Ist_LLSC:
+ st->Ist.LLSC.result += delta;
+ deltaIRExpr(st->Ist.LLSC.addr, delta);
+ if (st->Ist.LLSC.storedata)
+ deltaIRExpr(st->Ist.LLSC.storedata, delta);
+ break;
+ case Ist_Dirty:
+ d = st->Ist.Dirty.details;
+ deltaIRExpr(d->guard, delta);
+ for (i = 0; d->args[i]; i++)
+ deltaIRExpr(d->args[i], delta);
+ if (d->tmp != IRTemp_INVALID)
+ d->tmp += delta;
+ if (d->mAddr)
+ deltaIRExpr(d->mAddr, delta);
+ break;
+ default:
+ vex_printf("\n"); ppIRStmt(st); vex_printf("\n");
+ vpanic("deltaIRStmt");
+ }
+}
+
+
+/* If possible, return a loop-unrolled version of bb0.  bb0 itself
+   is not modified; it is deep-copied first.  If unrolling is not
+   possible, return NULL. */
+
+/* The two schemas considered are:
+
+ X: BODY; goto X
+
+ which unrolls to (eg) X: BODY;BODY; goto X
+
+ and
+
+ X: BODY; if (c) goto X; goto Y
+ which trivially transforms to
+ X: BODY; if (!c) goto Y; goto X;
+ so it falls in the scope of the first case.
+
+ X and Y must be literal (guest) addresses.
+*/
+
+static Int calc_unroll_factor( IRSB* bb )
+{
+ Int n_stmts, i;
+
+ n_stmts = 0;
+ for (i = 0; i < bb->stmts_used; i++) {
+ if (bb->stmts[i]->tag != Ist_NoOp)
+ n_stmts++;
+ }
+
+ if (n_stmts <= vex_control.iropt_unroll_thresh/8) {
+ if (vex_control.iropt_verbosity > 0)
+ vex_printf("vex iropt: 8 x unrolling (%d sts -> %d sts)\n",
+ n_stmts, 8* n_stmts);
+ return 8;
+ }
+ if (n_stmts <= vex_control.iropt_unroll_thresh/4) {
+ if (vex_control.iropt_verbosity > 0)
+ vex_printf("vex iropt: 4 x unrolling (%d sts -> %d sts)\n",
+ n_stmts, 4* n_stmts);
+ return 4;
+ }
+
+ if (n_stmts <= vex_control.iropt_unroll_thresh/2) {
+ if (vex_control.iropt_verbosity > 0)
+ vex_printf("vex iropt: 2 x unrolling (%d sts -> %d sts)\n",
+ n_stmts, 2* n_stmts);
+ return 2;
+ }
+
+ if (vex_control.iropt_verbosity > 0)
+ vex_printf("vex iropt: not unrolling (%d sts)\n", n_stmts);
+
+ return 1;
+}
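+
+/* For example (a sketch): with vex_control.iropt_unroll_thresh set
+   to 120, a 14-stmt body unrolls 8x (14 <= 120/8), a 25-stmt body
+   unrolls 4x, and a 70-stmt body is left alone. */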
+
+
+static IRSB* maybe_loop_unroll_BB ( IRSB* bb0, Addr64 my_addr )
+{
+ Int i, j, jmax, n_vars;
+ Bool xxx_known;
+ Addr64 xxx_value, yyy_value;
+ IRExpr* udst;
+ IRStmt* st;
+ IRConst* con;
+ IRSB *bb1, *bb2;
+ Int unroll_factor;
+
+ if (vex_control.iropt_unroll_thresh <= 0)
+ return NULL;
+
+ /* First off, figure out if we can unroll this loop. Do this
+ without modifying bb0. */
+
+ if (bb0->jumpkind != Ijk_Boring)
+ return NULL;
+
+ xxx_known = False;
+ xxx_value = 0;
+
+ /* Extract the next-guest address. If it isn't a literal, we
+ have to give up. */
+
+ udst = bb0->next;
+ if (udst->tag == Iex_Const
+ && (udst->Iex.Const.con->tag == Ico_U32
+ || udst->Iex.Const.con->tag == Ico_U64)) {
+ /* The BB ends in a jump to a literal location. */
+ xxx_known = True;
+ xxx_value = udst->Iex.Const.con->tag == Ico_U64
+ ? udst->Iex.Const.con->Ico.U64
+ : (Addr64)(udst->Iex.Const.con->Ico.U32);
+ }
+
+ if (!xxx_known)
+ return NULL;
+
+   /* Now we know the BB ends in a jump to a literal location.  If
+ it's a jump to itself (viz, idiom #1), move directly to the
+ unrolling stage, first cloning the bb so the original isn't
+ modified. */
+ if (xxx_value == my_addr) {
+ unroll_factor = calc_unroll_factor( bb0 );
+ if (unroll_factor < 2)
+ return NULL;
+ bb1 = deepCopyIRSB( bb0 );
+ bb0 = NULL;
+ udst = NULL; /* is now invalid */
+ goto do_unroll;
+ }
+
+ /* Search for the second idiomatic form:
+ X: BODY; if (c) goto X; goto Y
+ We know Y, but need to establish that the last stmt
+ is 'if (c) goto X'.
+ */
+ yyy_value = xxx_value;
+ for (i = bb0->stmts_used-1; i >= 0; i--)
+ if (bb0->stmts[i])
+ break;
+
+ if (i < 0)
+ return NULL; /* block with no stmts. Strange. */
+
+ st = bb0->stmts[i];
+ if (st->tag != Ist_Exit)
+ return NULL;
+ if (st->Ist.Exit.jk != Ijk_Boring)
+ return NULL;
+
+ con = st->Ist.Exit.dst;
+ vassert(con->tag == Ico_U32 || con->tag == Ico_U64);
+
+ xxx_value = con->tag == Ico_U64
+ ? st->Ist.Exit.dst->Ico.U64
+ : (Addr64)(st->Ist.Exit.dst->Ico.U32);
+
+ /* If this assertion fails, we have some kind of type error. */
+ vassert(con->tag == udst->Iex.Const.con->tag);
+
+ if (xxx_value != my_addr)
+ /* We didn't find either idiom. Give up. */
+ return NULL;
+
+ /* Ok, we found idiom #2. Copy the BB, switch around the xxx and
+ yyy values (which makes it look like idiom #1), and go into
+ unrolling proper. This means finding (again) the last stmt, in
+ the copied BB. */
+
+ unroll_factor = calc_unroll_factor( bb0 );
+ if (unroll_factor < 2)
+ return NULL;
+
+ bb1 = deepCopyIRSB( bb0 );
+ bb0 = NULL;
+ udst = NULL; /* is now invalid */
+ for (i = bb1->stmts_used-1; i >= 0; i--)
+ if (bb1->stmts[i])
+ break;
+
+ /* The next bunch of assertions should be true since we already
+ found and checked the last stmt in the original bb. */
+
+ vassert(i >= 0);
+
+ st = bb1->stmts[i];
+ vassert(st->tag == Ist_Exit);
+
+ con = st->Ist.Exit.dst;
+ vassert(con->tag == Ico_U32 || con->tag == Ico_U64);
+
+ udst = bb1->next;
+ vassert(udst->tag == Iex_Const);
+ vassert(udst->Iex.Const.con->tag == Ico_U32
+ || udst->Iex.Const.con->tag == Ico_U64);
+ vassert(con->tag == udst->Iex.Const.con->tag);
+
+ /* switch the xxx and yyy fields around */
+ if (con->tag == Ico_U64) {
+ udst->Iex.Const.con->Ico.U64 = xxx_value;
+ con->Ico.U64 = yyy_value;
+ } else {
+ udst->Iex.Const.con->Ico.U32 = (UInt)xxx_value;
+ con->Ico.U32 = (UInt)yyy_value;
+ }
+
+ /* negate the test condition */
+ st->Ist.Exit.guard
+ = IRExpr_Unop(Iop_Not1,deepCopyIRExpr(st->Ist.Exit.guard));
+
+   /* --- The unroller proper.  Both idioms have by --- */
+   /* --- now been converted to idiom 1. --- */
+
+ do_unroll:
+
+ vassert(unroll_factor == 2
+ || unroll_factor == 4
+ || unroll_factor == 8);
+
+ jmax = unroll_factor==8 ? 3 : (unroll_factor==4 ? 2 : 1);
+ for (j = 1; j <= jmax; j++) {
+
+ n_vars = bb1->tyenv->types_used;
+
+ bb2 = deepCopyIRSB(bb1);
+ for (i = 0; i < n_vars; i++)
+ (void)newIRTemp(bb1->tyenv, bb2->tyenv->types[i]);
+
+ for (i = 0; i < bb2->stmts_used; i++) {
+ /* deltaIRStmt destructively modifies the stmt, but
+ that's OK since bb2 is a complete fresh copy of bb1. */
+ deltaIRStmt(bb2->stmts[i], n_vars);
+ addStmtToIRSB(bb1, bb2->stmts[i]);
+ }
+ }
+
+ if (DEBUG_IROPT) {
+ vex_printf("\nUNROLLED (%llx)\n", my_addr);
+ ppIRSB(bb1);
+ vex_printf("\n");
+ }
+
+ /* Flattening; sigh. The unroller succeeds in breaking flatness
+ by negating the test condition. This should be fixed properly.
+ For the moment use this shotgun approach. */
+ return flatten_BB(bb1);
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- The tree builder ---*/
+/*---------------------------------------------------------------*/
+
+/* This isn't part of IR optimisation. Really it's a pass done prior
+ to instruction selection, which improves the code that the
+ instruction selector can produce. */
+
+/* --- The 'tmp' environment is the central data structure here --- */
+
+/* The number of outstanding bindings we're prepared to track.
+ The number of times the env becomes full and we have to dump
+ the oldest binding (hence reducing code quality) falls very
+ rapidly as the env size increases. 8 gives reasonable performance
+ under most circumstances. */
+#define A_NENV 10
+
+/* bindee == NULL === slot is not in use
+ bindee != NULL === slot is in use
+*/
+typedef
+ struct {
+ IRTemp binder;
+ IRExpr* bindee;
+ Bool doesLoad;
+ Bool doesGet;
+ }
+ ATmpInfo;
+
+__attribute__((unused))
+static void ppAEnv ( ATmpInfo* env )
+{
+ Int i;
+ for (i = 0; i < A_NENV; i++) {
+ vex_printf("%d tmp %d val ", i, (Int)env[i].binder);
+ if (env[i].bindee)
+ ppIRExpr(env[i].bindee);
+ else
+ vex_printf("(null)");
+ vex_printf("\n");
+ }
+}
+
+/* --- Tree-traversal fns --- */
+
+/* Traverse an expr, and detect if any part of it reads memory or does
+ a Get. Be careful ... this really controls how much the
+ tree-builder can reorder the code, so getting it right is critical.
+*/
+static void setHints_Expr (Bool* doesLoad, Bool* doesGet, IRExpr* e )
+{
+ Int i;
+ switch (e->tag) {
+ case Iex_CCall:
+ for (i = 0; e->Iex.CCall.args[i]; i++)
+ setHints_Expr(doesLoad, doesGet, e->Iex.CCall.args[i]);
+ return;
+ case Iex_Mux0X:
+ setHints_Expr(doesLoad, doesGet, e->Iex.Mux0X.cond);
+ setHints_Expr(doesLoad, doesGet, e->Iex.Mux0X.expr0);
+ setHints_Expr(doesLoad, doesGet, e->Iex.Mux0X.exprX);
+ return;
+ case Iex_Qop:
+ setHints_Expr(doesLoad, doesGet, e->Iex.Qop.arg1);
+ setHints_Expr(doesLoad, doesGet, e->Iex.Qop.arg2);
+ setHints_Expr(doesLoad, doesGet, e->Iex.Qop.arg3);
+ setHints_Expr(doesLoad, doesGet, e->Iex.Qop.arg4);
+ return;
+ case Iex_Triop:
+ setHints_Expr(doesLoad, doesGet, e->Iex.Triop.arg1);
+ setHints_Expr(doesLoad, doesGet, e->Iex.Triop.arg2);
+ setHints_Expr(doesLoad, doesGet, e->Iex.Triop.arg3);
+ return;
+ case Iex_Binop:
+ setHints_Expr(doesLoad, doesGet, e->Iex.Binop.arg1);
+ setHints_Expr(doesLoad, doesGet, e->Iex.Binop.arg2);
+ return;
+ case Iex_Unop:
+ setHints_Expr(doesLoad, doesGet, e->Iex.Unop.arg);
+ return;
+ case Iex_Load:
+ *doesLoad = True;
+ setHints_Expr(doesLoad, doesGet, e->Iex.Load.addr);
+ return;
+ case Iex_Get:
+ *doesGet = True;
+ return;
+ case Iex_GetI:
+ *doesGet = True;
+ setHints_Expr(doesLoad, doesGet, e->Iex.GetI.ix);
+ return;
+ case Iex_RdTmp:
+ case Iex_Const:
+ return;
+ default:
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+ vpanic("setHints_Expr");
+ }
+}
+
+
+/* Add a binding to the front of the env and slide all the rest
+ backwards. It should be the case that the last slot is free. */
+static void addToEnvFront ( ATmpInfo* env, IRTemp binder, IRExpr* bindee )
+{
+ Int i;
+ vassert(env[A_NENV-1].bindee == NULL);
+ for (i = A_NENV-1; i >= 1; i--)
+ env[i] = env[i-1];
+ env[0].binder = binder;
+ env[0].bindee = bindee;
+ env[0].doesLoad = False; /* filled in later */
+ env[0].doesGet = False; /* filled in later */
+}
+
+/* Given uses :: array of UShort, indexed by IRTemp
+ Add the use-occurrences of temps in this expression
+   to the counts in 'uses'.
+*/
+static void aoccCount_Expr ( UShort* uses, IRExpr* e )
+{
+ Int i;
+
+ switch (e->tag) {
+
+ case Iex_RdTmp: /* the only interesting case */
+ uses[e->Iex.RdTmp.tmp]++;
+ return;
+
+ case Iex_Mux0X:
+ aoccCount_Expr(uses, e->Iex.Mux0X.cond);
+ aoccCount_Expr(uses, e->Iex.Mux0X.expr0);
+ aoccCount_Expr(uses, e->Iex.Mux0X.exprX);
+ return;
+
+ case Iex_Qop:
+ aoccCount_Expr(uses, e->Iex.Qop.arg1);
+ aoccCount_Expr(uses, e->Iex.Qop.arg2);
+ aoccCount_Expr(uses, e->Iex.Qop.arg3);
+ aoccCount_Expr(uses, e->Iex.Qop.arg4);
+ return;
+
+ case Iex_Triop:
+ aoccCount_Expr(uses, e->Iex.Triop.arg1);
+ aoccCount_Expr(uses, e->Iex.Triop.arg2);
+ aoccCount_Expr(uses, e->Iex.Triop.arg3);
+ return;
+
+ case Iex_Binop:
+ aoccCount_Expr(uses, e->Iex.Binop.arg1);
+ aoccCount_Expr(uses, e->Iex.Binop.arg2);
+ return;
+
+ case Iex_Unop:
+ aoccCount_Expr(uses, e->Iex.Unop.arg);
+ return;
+
+ case Iex_Load:
+ aoccCount_Expr(uses, e->Iex.Load.addr);
+ return;
+
+ case Iex_CCall:
+ for (i = 0; e->Iex.CCall.args[i]; i++)
+ aoccCount_Expr(uses, e->Iex.CCall.args[i]);
+ return;
+
+ case Iex_GetI:
+ aoccCount_Expr(uses, e->Iex.GetI.ix);
+ return;
+
+ case Iex_Const:
+ case Iex_Get:
+ return;
+
+ default:
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+ vpanic("aoccCount_Expr");
+ }
+}
+
+
+/* Given uses :: array of UShort, indexed by IRTemp
+ Add the use-occurrences of temps in this statement
+   to the counts in 'uses'.
+*/
+static void aoccCount_Stmt ( UShort* uses, IRStmt* st )
+{
+ Int i;
+ IRDirty* d;
+ IRCAS* cas;
+ switch (st->tag) {
+ case Ist_AbiHint:
+ aoccCount_Expr(uses, st->Ist.AbiHint.base);
+ aoccCount_Expr(uses, st->Ist.AbiHint.nia);
+ return;
+ case Ist_WrTmp:
+ aoccCount_Expr(uses, st->Ist.WrTmp.data);
+ return;
+ case Ist_Put:
+ aoccCount_Expr(uses, st->Ist.Put.data);
+ return;
+ case Ist_PutI:
+ aoccCount_Expr(uses, st->Ist.PutI.ix);
+ aoccCount_Expr(uses, st->Ist.PutI.data);
+ return;
+ case Ist_Store:
+ aoccCount_Expr(uses, st->Ist.Store.addr);
+ aoccCount_Expr(uses, st->Ist.Store.data);
+ return;
+ case Ist_CAS:
+ cas = st->Ist.CAS.details;
+ aoccCount_Expr(uses, cas->addr);
+ if (cas->expdHi)
+ aoccCount_Expr(uses, cas->expdHi);
+ aoccCount_Expr(uses, cas->expdLo);
+ if (cas->dataHi)
+ aoccCount_Expr(uses, cas->dataHi);
+ aoccCount_Expr(uses, cas->dataLo);
+ return;
+ case Ist_LLSC:
+ aoccCount_Expr(uses, st->Ist.LLSC.addr);
+ if (st->Ist.LLSC.storedata)
+ aoccCount_Expr(uses, st->Ist.LLSC.storedata);
+ return;
+ case Ist_Dirty:
+ d = st->Ist.Dirty.details;
+ if (d->mFx != Ifx_None)
+ aoccCount_Expr(uses, d->mAddr);
+ aoccCount_Expr(uses, d->guard);
+ for (i = 0; d->args[i]; i++)
+ aoccCount_Expr(uses, d->args[i]);
+ return;
+ case Ist_NoOp:
+ case Ist_IMark:
+ case Ist_MBE:
+ return;
+ case Ist_Exit:
+ aoccCount_Expr(uses, st->Ist.Exit.guard);
+ return;
+ default:
+ vex_printf("\n"); ppIRStmt(st); vex_printf("\n");
+ vpanic("aoccCount_Stmt");
+ }
+}
+
+/* Look up a binding for tmp in the env. If found, return the bound
+ expression, and set the env's binding to NULL so it is marked as
+ used. If not found, return NULL. */
+
+static IRExpr* atbSubst_Temp ( ATmpInfo* env, IRTemp tmp )
+{
+ Int i;
+ for (i = 0; i < A_NENV; i++) {
+ if (env[i].binder == tmp && env[i].bindee != NULL) {
+ IRExpr* bindee = env[i].bindee;
+ env[i].bindee = NULL;
+ return bindee;
+ }
+ }
+ return NULL;
+}
+
+/* Traverse e, looking for temps. For each observed temp, see if env
+ contains a binding for the temp, and if so return the bound value.
+ The env has the property that any binding it holds is
+ 'single-shot', so once a binding is used, it is marked as no longer
+ available, by setting its .bindee field to NULL. */
+
+static inline Bool is_Unop ( IRExpr* e, IROp op ) {
+ return e->tag == Iex_Unop && e->Iex.Unop.op == op;
+}
+static inline Bool is_Binop ( IRExpr* e, IROp op ) {
+ return e->tag == Iex_Binop && e->Iex.Binop.op == op;
+}
+
+static IRExpr* fold_IRExpr_Binop ( IROp op, IRExpr* a1, IRExpr* a2 )
+{
+ switch (op) {
+ case Iop_Or32:
+ /* Or32( CmpwNEZ32(x), CmpwNEZ32(y) ) --> CmpwNEZ32( Or32( x, y ) ) */
+ if (is_Unop(a1, Iop_CmpwNEZ32) && is_Unop(a2, Iop_CmpwNEZ32))
+ return IRExpr_Unop( Iop_CmpwNEZ32,
+ IRExpr_Binop( Iop_Or32, a1->Iex.Unop.arg,
+ a2->Iex.Unop.arg ) );
+ break;
+ default:
+ break;
+ }
+ /* no reduction rule applies */
+ return IRExpr_Binop( op, a1, a2 );
+}
+
+static IRExpr* fold_IRExpr_Unop ( IROp op, IRExpr* aa )
+{
+ switch (op) {
+ case Iop_CmpwNEZ64:
+ /* CmpwNEZ64( Or64 ( CmpwNEZ64(x), y ) ) --> CmpwNEZ64( Or64( x, y ) ) */
+ if (is_Binop(aa, Iop_Or64)
+ && is_Unop(aa->Iex.Binop.arg1, Iop_CmpwNEZ64))
+ return fold_IRExpr_Unop(
+ Iop_CmpwNEZ64,
+ IRExpr_Binop(Iop_Or64,
+ aa->Iex.Binop.arg1->Iex.Unop.arg,
+ aa->Iex.Binop.arg2));
+ /* CmpwNEZ64( Or64 ( x, CmpwNEZ64(y) ) ) --> CmpwNEZ64( Or64( x, y ) ) */
+ if (is_Binop(aa, Iop_Or64)
+ && is_Unop(aa->Iex.Binop.arg2, Iop_CmpwNEZ64))
+ return fold_IRExpr_Unop(
+ Iop_CmpwNEZ64,
+ IRExpr_Binop(Iop_Or64,
+ aa->Iex.Binop.arg1,
+ aa->Iex.Binop.arg2->Iex.Unop.arg));
+ break;
+ case Iop_CmpNEZ64:
+ /* CmpNEZ64( Left64(x) ) --> CmpNEZ64(x) */
+ if (is_Unop(aa, Iop_Left64))
+ return IRExpr_Unop(Iop_CmpNEZ64, aa->Iex.Unop.arg);
+ break;
+ case Iop_CmpwNEZ32:
+ /* CmpwNEZ32( CmpwNEZ32 ( x ) ) --> CmpwNEZ32 ( x ) */
+ if (is_Unop(aa, Iop_CmpwNEZ32))
+ return IRExpr_Unop( Iop_CmpwNEZ32, aa->Iex.Unop.arg );
+ break;
+ case Iop_CmpNEZ32:
+ /* CmpNEZ32( Left32(x) ) --> CmpNEZ32(x) */
+ if (is_Unop(aa, Iop_Left32))
+ return IRExpr_Unop(Iop_CmpNEZ32, aa->Iex.Unop.arg);
+ break;
+ case Iop_Left32:
+ /* Left32( Left32(x) ) --> Left32(x) */
+ if (is_Unop(aa, Iop_Left32))
+ return IRExpr_Unop( Iop_Left32, aa->Iex.Unop.arg );
+ break;
+ case Iop_32to1:
+ /* 32to1( 1Uto32 ( x ) ) --> x */
+ if (is_Unop(aa, Iop_1Uto32))
+ return aa->Iex.Unop.arg;
+ /* 32to1( CmpwNEZ32 ( x )) --> CmpNEZ32(x) */
+ if (is_Unop(aa, Iop_CmpwNEZ32))
+ return IRExpr_Unop( Iop_CmpNEZ32, aa->Iex.Unop.arg );
+ break;
+ case Iop_64to1:
+ /* 64to1( 1Uto64 ( x ) ) --> x */
+ if (is_Unop(aa, Iop_1Uto64))
+ return aa->Iex.Unop.arg;
+ /* 64to1( CmpwNEZ64 ( x )) --> CmpNEZ64(x) */
+ if (is_Unop(aa, Iop_CmpwNEZ64))
+ return IRExpr_Unop( Iop_CmpNEZ64, aa->Iex.Unop.arg );
+ break;
+ case Iop_64to32:
+ /* 64to32( 32Uto64 ( x )) --> x */
+ if (is_Unop(aa, Iop_32Uto64))
+ return aa->Iex.Unop.arg;
+ /* 64to32( 8Uto64 ( x )) --> 8Uto32(x) */
+ if (is_Unop(aa, Iop_8Uto64))
+ return IRExpr_Unop(Iop_8Uto32, aa->Iex.Unop.arg);
+ break;
+
+ case Iop_32Uto64:
+ /* 32Uto64( 8Uto32( x )) --> 8Uto64(x) */
+ if (is_Unop(aa, Iop_8Uto32))
+ return IRExpr_Unop(Iop_8Uto64, aa->Iex.Unop.arg);
+ /* 32Uto64( 16Uto32( x )) --> 16Uto64(x) */
+ if (is_Unop(aa, Iop_16Uto32))
+ return IRExpr_Unop(Iop_16Uto64, aa->Iex.Unop.arg);
+ break;
+
+ case Iop_1Sto32:
+      /* 1Sto32( CmpNEZ8( 32to8( 1Uto32( CmpNEZ32( x ))))) --> CmpwNEZ32(x) */
+ if (is_Unop(aa, Iop_CmpNEZ8)
+ && is_Unop(aa->Iex.Unop.arg, Iop_32to8)
+ && is_Unop(aa->Iex.Unop.arg->Iex.Unop.arg, Iop_1Uto32)
+ && is_Unop(aa->Iex.Unop.arg->Iex.Unop.arg->Iex.Unop.arg,
+ Iop_CmpNEZ32)) {
+ return IRExpr_Unop( Iop_CmpwNEZ32,
+ aa->Iex.Unop.arg->Iex.Unop.arg
+ ->Iex.Unop.arg->Iex.Unop.arg);
+ }
+ break;
+
+
+ default:
+ break;
+ }
+ /* no reduction rule applies */
+ return IRExpr_Unop( op, aa );
+}
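+
+/* To see how the rules above compose through the recursion (a worked
+   example, not an extra rule): folding
+      CmpwNEZ64( Or64( CmpwNEZ64(x), CmpwNEZ64(y) ) )
+   first strips the CmpwNEZ64 on arg1 and recurses, then strips the
+   one on arg2, giving
+      CmpwNEZ64( Or64( x, y ) ). */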
+
+static IRExpr* atbSubst_Expr ( ATmpInfo* env, IRExpr* e )
+{
+ IRExpr* e2;
+ IRExpr** args2;
+ Int i;
+
+ switch (e->tag) {
+
+ case Iex_CCall:
+ args2 = shallowCopyIRExprVec(e->Iex.CCall.args);
+ for (i = 0; args2[i]; i++)
+ args2[i] = atbSubst_Expr(env,args2[i]);
+ return IRExpr_CCall(
+ e->Iex.CCall.cee,
+ e->Iex.CCall.retty,
+ args2
+ );
+ case Iex_RdTmp:
+ e2 = atbSubst_Temp(env, e->Iex.RdTmp.tmp);
+ return e2 ? e2 : e;
+ case Iex_Mux0X:
+ return IRExpr_Mux0X(
+ atbSubst_Expr(env, e->Iex.Mux0X.cond),
+ atbSubst_Expr(env, e->Iex.Mux0X.expr0),
+ atbSubst_Expr(env, e->Iex.Mux0X.exprX)
+ );
+ case Iex_Qop:
+ return IRExpr_Qop(
+ e->Iex.Qop.op,
+ atbSubst_Expr(env, e->Iex.Qop.arg1),
+ atbSubst_Expr(env, e->Iex.Qop.arg2),
+ atbSubst_Expr(env, e->Iex.Qop.arg3),
+ atbSubst_Expr(env, e->Iex.Qop.arg4)
+ );
+ case Iex_Triop:
+ return IRExpr_Triop(
+ e->Iex.Triop.op,
+ atbSubst_Expr(env, e->Iex.Triop.arg1),
+ atbSubst_Expr(env, e->Iex.Triop.arg2),
+ atbSubst_Expr(env, e->Iex.Triop.arg3)
+ );
+ case Iex_Binop:
+ return fold_IRExpr_Binop(
+ e->Iex.Binop.op,
+ atbSubst_Expr(env, e->Iex.Binop.arg1),
+ atbSubst_Expr(env, e->Iex.Binop.arg2)
+ );
+ case Iex_Unop:
+ return fold_IRExpr_Unop(
+ e->Iex.Unop.op,
+ atbSubst_Expr(env, e->Iex.Unop.arg)
+ );
+ case Iex_Load:
+ return IRExpr_Load(
+ e->Iex.Load.end,
+ e->Iex.Load.ty,
+ atbSubst_Expr(env, e->Iex.Load.addr)
+ );
+ case Iex_GetI:
+ return IRExpr_GetI(
+ e->Iex.GetI.descr,
+ atbSubst_Expr(env, e->Iex.GetI.ix),
+ e->Iex.GetI.bias
+ );
+ case Iex_Const:
+ case Iex_Get:
+ return e;
+ default:
+ vex_printf("\n"); ppIRExpr(e); vex_printf("\n");
+ vpanic("atbSubst_Expr");
+ }
+}
+
+/* Same deal as atbSubst_Expr, except for stmts. */
+
+static IRStmt* atbSubst_Stmt ( ATmpInfo* env, IRStmt* st )
+{
+ Int i;
+ IRDirty *d, *d2;
+ IRCAS *cas, *cas2;
+ switch (st->tag) {
+ case Ist_AbiHint:
+ return IRStmt_AbiHint(
+ atbSubst_Expr(env, st->Ist.AbiHint.base),
+ st->Ist.AbiHint.len,
+ atbSubst_Expr(env, st->Ist.AbiHint.nia)
+ );
+ case Ist_Store:
+ return IRStmt_Store(
+ st->Ist.Store.end,
+ atbSubst_Expr(env, st->Ist.Store.addr),
+ atbSubst_Expr(env, st->Ist.Store.data)
+ );
+ case Ist_WrTmp:
+ return IRStmt_WrTmp(
+ st->Ist.WrTmp.tmp,
+ atbSubst_Expr(env, st->Ist.WrTmp.data)
+ );
+ case Ist_Put:
+ return IRStmt_Put(
+ st->Ist.Put.offset,
+ atbSubst_Expr(env, st->Ist.Put.data)
+ );
+ case Ist_PutI:
+ return IRStmt_PutI(
+ st->Ist.PutI.descr,
+ atbSubst_Expr(env, st->Ist.PutI.ix),
+ st->Ist.PutI.bias,
+ atbSubst_Expr(env, st->Ist.PutI.data)
+ );
+
+ case Ist_Exit:
+ return IRStmt_Exit(
+ atbSubst_Expr(env, st->Ist.Exit.guard),
+ st->Ist.Exit.jk,
+ st->Ist.Exit.dst
+ );
+ case Ist_IMark:
+ return IRStmt_IMark(st->Ist.IMark.addr, st->Ist.IMark.len);
+ case Ist_NoOp:
+ return IRStmt_NoOp();
+ case Ist_MBE:
+ return IRStmt_MBE(st->Ist.MBE.event);
+ case Ist_CAS:
+ cas = st->Ist.CAS.details;
+ cas2 = mkIRCAS(
+ cas->oldHi, cas->oldLo, cas->end,
+ atbSubst_Expr(env, cas->addr),
+ cas->expdHi ? atbSubst_Expr(env, cas->expdHi) : NULL,
+ atbSubst_Expr(env, cas->expdLo),
+ cas->dataHi ? atbSubst_Expr(env, cas->dataHi) : NULL,
+ atbSubst_Expr(env, cas->dataLo)
+ );
+ return IRStmt_CAS(cas2);
+ case Ist_LLSC:
+ return IRStmt_LLSC(
+ st->Ist.LLSC.end,
+ st->Ist.LLSC.result,
+ atbSubst_Expr(env, st->Ist.LLSC.addr),
+ st->Ist.LLSC.storedata
+ ? atbSubst_Expr(env, st->Ist.LLSC.storedata) : NULL
+ );
+ case Ist_Dirty:
+ d = st->Ist.Dirty.details;
+ d2 = emptyIRDirty();
+ *d2 = *d;
+ if (d2->mFx != Ifx_None)
+ d2->mAddr = atbSubst_Expr(env, d2->mAddr);
+ d2->guard = atbSubst_Expr(env, d2->guard);
+ for (i = 0; d2->args[i]; i++)
+ d2->args[i] = atbSubst_Expr(env, d2->args[i]);
+ return IRStmt_Dirty(d2);
+ default:
+ vex_printf("\n"); ppIRStmt(st); vex_printf("\n");
+ vpanic("atbSubst_Stmt");
+ }
+}
+
+/* notstatic */ void ado_treebuild_BB ( IRSB* bb )
+{
+ Int i, j, k, m;
+ Bool stmtPuts, stmtStores, invalidateMe;
+ IRStmt* st;
+ IRStmt* st2;
+ ATmpInfo env[A_NENV];
+
+ Int n_tmps = bb->tyenv->types_used;
+ UShort* uses = LibVEX_Alloc(n_tmps * sizeof(UShort));
+
+ /* Phase 1. Scan forwards in bb, counting use occurrences of each
+ temp. Also count occurrences in the bb->next field. */
+
+ for (i = 0; i < n_tmps; i++)
+ uses[i] = 0;
+
+ for (i = 0; i < bb->stmts_used; i++) {
+ st = bb->stmts[i];
+ if (st->tag == Ist_NoOp)
+ continue;
+ aoccCount_Stmt( uses, st );
+ }
+ aoccCount_Expr(uses, bb->next );
+
+# if 0
+ for (i = 0; i < n_tmps; i++) {
+ if (uses[i] == 0)
+ continue;
+ ppIRTemp( (IRTemp)i );
+ vex_printf(" used %d\n", (Int)uses[i] );
+ }
+# endif
+
+ /* Phase 2. Scan forwards in bb. For each statement in turn:
+
+      If the env is full, emit the oldest (end) element.  This
+      guarantees there is at least one free slot for the steps below.
+
+ On seeing 't = E', occ(t)==1,
+ let E'=env(E)
+ delete this stmt
+ add t -> E' to the front of the env
+ Examine E' and set the hints for E' appropriately
+ (doesLoad? doesGet?)
+
+ On seeing any other stmt,
+ let stmt' = env(stmt)
+ remove from env any 't=E' binds invalidated by stmt
+ emit the invalidated stmts
+ emit stmt'
+ compact any holes in env
+ by sliding entries towards the front
+
+ Finally, apply env to bb->next.
+ */
+
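+   /* A worked example of the above (illustrative; the temp numbers
+      are invented).  Given the flat input
+         t1 = LDle:I32(t0)
+         t2 = Add32(t1,0x4:I32)
+         STle(t9) = t2
+      with occ(t1) == occ(t2) == 1, both WrTmp stmts are deleted and
+      held in env, and the store is emitted as the single tree
+         STle(t9) = Add32(LDle:I32(t0),0x4:I32)  */
+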
+ for (i = 0; i < A_NENV; i++) {
+ env[i].bindee = NULL;
+ env[i].binder = IRTemp_INVALID;
+ }
+
+ /* The stmts in bb are being reordered, and we are guaranteed to
+      end up with no more than the number we started with.  Use i as
+      the cursor for the current stmt being examined, and j <= i as
+      the cursor for the current stmt being written.
+ */
+ j = 0;
+ for (i = 0; i < bb->stmts_used; i++) {
+
+ st = bb->stmts[i];
+ if (st->tag == Ist_NoOp)
+ continue;
+
+ /* Ensure there's at least one space in the env, by emitting
+ the oldest binding if necessary. */
+ if (env[A_NENV-1].bindee != NULL) {
+ bb->stmts[j] = IRStmt_WrTmp( env[A_NENV-1].binder,
+ env[A_NENV-1].bindee );
+ j++;
+ vassert(j <= i);
+ env[A_NENV-1].bindee = NULL;
+ }
+
+ /* Consider current stmt. */
+ if (st->tag == Ist_WrTmp && uses[st->Ist.WrTmp.tmp] <= 1) {
+ IRExpr *e, *e2;
+
+ /* optional extra: dump dead bindings as we find them.
+ Removes the need for a prior dead-code removal pass. */
+ if (uses[st->Ist.WrTmp.tmp] == 0) {
+ if (0) vex_printf("DEAD binding\n");
+ continue; /* for (i = 0; i < bb->stmts_used; i++) loop */
+ }
+ vassert(uses[st->Ist.WrTmp.tmp] == 1);
+
+ /* ok, we have 't = E', occ(t)==1. Do the abovementioned
+ actions. */
+ e = st->Ist.WrTmp.data;
+ e2 = atbSubst_Expr(env, e);
+ addToEnvFront(env, st->Ist.WrTmp.tmp, e2);
+ setHints_Expr(&env[0].doesLoad, &env[0].doesGet, e2);
+ /* don't advance j, as we are deleting this stmt and instead
+ holding it temporarily in the env. */
+ continue; /* for (i = 0; i < bb->stmts_used; i++) loop */
+ }
+
+ /* we get here for any other kind of statement. */
+ /* 'use up' any bindings required by the current statement. */
+ st2 = atbSubst_Stmt(env, st);
+
+ /* Now, before this stmt, dump any bindings in env that it
+ invalidates. These need to be dumped in the order in which
+ they originally entered env -- that means from oldest to
+ youngest. */
+
+      /* stmtPuts/stmtStores characterise what the stmt under
+         consideration does, or might do (erring on the side of True
+         is always safe). */
+ stmtPuts
+ = toBool( st->tag == Ist_Put
+ || st->tag == Ist_PutI
+ || st->tag == Ist_Dirty );
+
+      /* stmtStores is True if this stmt writes memory, or might do so
+         (==> we don't want to reorder other loads or stores relative
+         to it).  Also, both LL and SC fall under this classification,
+         since we really ought to be conservative and not reorder any
+         other memory transactions relative to them. */
+ stmtStores
+ = toBool( st->tag == Ist_Store
+ || st->tag == Ist_Dirty
+ || st->tag == Ist_LLSC );
+
+ for (k = A_NENV-1; k >= 0; k--) {
+ if (env[k].bindee == NULL)
+ continue;
+ /* Compare the actions of this stmt with the actions of
+ binding 'k', to see if they invalidate the binding. */
+ invalidateMe
+ = toBool(
+ /* a store invalidates loaded data */
+ (env[k].doesLoad && stmtStores)
+ /* a put invalidates get'd data */
+ || (env[k].doesGet && stmtPuts)
+ /* a put invalidates loaded data. Note, we could do
+ much better here in the sense that we only need to
+ invalidate trees containing loads if the Put in
+ question is marked as requiring precise
+ exceptions. */
+ || (env[k].doesLoad && stmtPuts)
+ /* probably overly conservative: a memory bus event
+ invalidates absolutely everything, so that all
+ computation prior to it is forced to complete before
+ proceeding with the event (fence,lock,unlock). */
+ || st->tag == Ist_MBE
+ /* also be (probably overly) paranoid re AbiHints */
+ || st->tag == Ist_AbiHint
+ );
+ if (invalidateMe) {
+ bb->stmts[j] = IRStmt_WrTmp( env[k].binder, env[k].bindee );
+ j++;
+ vassert(j <= i);
+ env[k].bindee = NULL;
+ }
+ }
+
+ /* Slide in-use entries in env up to the front */
+ m = 0;
+ for (k = 0; k < A_NENV; k++) {
+ if (env[k].bindee != NULL) {
+ env[m] = env[k];
+ m++;
+ }
+ }
+      for (; m < A_NENV; m++) {
+ env[m].bindee = NULL;
+ }
+
+ /* finally, emit the substituted statement */
+ bb->stmts[j] = st2;
+ /* vex_printf("**2 "); ppIRStmt(bb->stmts[j]); vex_printf("\n"); */
+ j++;
+
+ vassert(j <= i+1);
+ } /* for each stmt in the original bb ... */
+
+ /* Finally ... substitute the ->next field as much as possible, and
+ dump any left-over bindings. Hmm. Perhaps there should be no
+ left over bindings? Or any left-over bindings are
+ by definition dead? */
+ bb->next = atbSubst_Expr(env, bb->next);
+ bb->stmts_used = j;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- iropt main ---*/
+/*---------------------------------------------------------------*/
+
+static Bool iropt_verbose = False; /* True; */
+
+
+/* Do a simple cleanup pass on bb. This is: redundant Get removal,
+ redundant Put removal, constant propagation, dead code removal,
+ clean helper specialisation, and dead code removal (again).
+*/
+
+
+static
+IRSB* cheap_transformations (
+ IRSB* bb,
+ IRExpr* (*specHelper) (HChar*, IRExpr**, IRStmt**, Int),
+ Bool (*preciseMemExnsFn)(Int,Int)
+ )
+{
+ redundant_get_removal_BB ( bb );
+ if (iropt_verbose) {
+ vex_printf("\n========= REDUNDANT GET\n\n" );
+ ppIRSB(bb);
+ }
+
+ redundant_put_removal_BB ( bb, preciseMemExnsFn );
+ if (iropt_verbose) {
+ vex_printf("\n========= REDUNDANT PUT\n\n" );
+ ppIRSB(bb);
+ }
+
+ bb = cprop_BB ( bb );
+ if (iropt_verbose) {
+ vex_printf("\n========= CPROPD\n\n" );
+ ppIRSB(bb);
+ }
+
+ do_deadcode_BB ( bb );
+ if (iropt_verbose) {
+ vex_printf("\n========= DEAD\n\n" );
+ ppIRSB(bb);
+ }
+
+ bb = spec_helpers_BB ( bb, specHelper );
+ do_deadcode_BB ( bb );
+ if (iropt_verbose) {
+ vex_printf("\n========= SPECd \n\n" );
+ ppIRSB(bb);
+ }
+
+ return bb;
+}
+
+
+/* Do some more expensive transformations on bb, which are aimed at
+ optimising as much as possible in the presence of GetI and PutI. */
+
+static
+IRSB* expensive_transformations( IRSB* bb )
+{
+ (void)do_cse_BB( bb );
+ collapse_AddSub_chains_BB( bb );
+ do_redundant_GetI_elimination( bb );
+ do_redundant_PutI_elimination( bb );
+ do_deadcode_BB( bb );
+ return bb;
+}
+
+
+/* Scan a flattened BB to look for signs that more expensive
+ optimisations might be useful:
+ - find out if there are any GetIs and PutIs
+ - find out if there are any floating or vector-typed temporaries
+*/
+
+static void considerExpensives ( /*OUT*/Bool* hasGetIorPutI,
+ /*OUT*/Bool* hasVorFtemps,
+ IRSB* bb )
+{
+ Int i, j;
+ IRStmt* st;
+ IRDirty* d;
+ IRCAS* cas;
+
+ *hasGetIorPutI = False;
+ *hasVorFtemps = False;
+
+ for (i = 0; i < bb->stmts_used; i++) {
+ st = bb->stmts[i];
+ switch (st->tag) {
+ case Ist_AbiHint:
+ vassert(isIRAtom(st->Ist.AbiHint.base));
+ vassert(isIRAtom(st->Ist.AbiHint.nia));
+ break;
+ case Ist_PutI:
+ *hasGetIorPutI = True;
+ break;
+ case Ist_WrTmp:
+ if (st->Ist.WrTmp.data->tag == Iex_GetI)
+ *hasGetIorPutI = True;
+ switch (typeOfIRTemp(bb->tyenv, st->Ist.WrTmp.tmp)) {
+ case Ity_I1: case Ity_I8: case Ity_I16:
+ case Ity_I32: case Ity_I64: case Ity_I128:
+ break;
+ case Ity_F32: case Ity_F64: case Ity_V128:
+ *hasVorFtemps = True;
+ break;
+ default:
+ goto bad;
+ }
+ break;
+ case Ist_Put:
+ vassert(isIRAtom(st->Ist.Put.data));
+ break;
+ case Ist_Store:
+ vassert(isIRAtom(st->Ist.Store.addr));
+ vassert(isIRAtom(st->Ist.Store.data));
+ break;
+ case Ist_CAS:
+ cas = st->Ist.CAS.details;
+ vassert(isIRAtom(cas->addr));
+ vassert(cas->expdHi == NULL || isIRAtom(cas->expdHi));
+ vassert(isIRAtom(cas->expdLo));
+ vassert(cas->dataHi == NULL || isIRAtom(cas->dataHi));
+ vassert(isIRAtom(cas->dataLo));
+ break;
+ case Ist_LLSC:
+ vassert(isIRAtom(st->Ist.LLSC.addr));
+ if (st->Ist.LLSC.storedata)
+ vassert(isIRAtom(st->Ist.LLSC.storedata));
+ break;
+ case Ist_Dirty:
+ d = st->Ist.Dirty.details;
+ vassert(isIRAtom(d->guard));
+ for (j = 0; d->args[j]; j++)
+ vassert(isIRAtom(d->args[j]));
+ if (d->mFx != Ifx_None)
+ vassert(isIRAtom(d->mAddr));
+ break;
+ case Ist_NoOp:
+ case Ist_IMark:
+ case Ist_MBE:
+ break;
+ case Ist_Exit:
+ vassert(isIRAtom(st->Ist.Exit.guard));
+ break;
+ default:
+ bad:
+ ppIRStmt(st);
+ vpanic("considerExpensives");
+ }
+ }
+}
+
+
+/* ---------------- The main iropt entry point. ---------------- */
+
+/* exported from this file */
+/* Rules of the game:
+
+ - IRExpr/IRStmt trees should be treated as immutable, as they
+ may get shared. So never change a field of such a tree node;
+ instead construct and return a new one if needed.
+*/
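+
+/* For instance (a sketch of the rule, not code from this file): to
+   'change' the argument of a Unop, construct a replacement node,
+      e2 = IRExpr_Unop(e->Iex.Unop.op, newArg);
+   rather than assigning to e->Iex.Unop.arg, since e may be shared. */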
+
+
+IRSB* do_iropt_BB(
+ IRSB* bb0,
+ IRExpr* (*specHelper) (HChar*, IRExpr**, IRStmt**, Int),
+ Bool (*preciseMemExnsFn)(Int,Int),
+ Addr64 guest_addr,
+ VexArch guest_arch
+ )
+{
+ static Int n_total = 0;
+ static Int n_expensive = 0;
+
+ Bool hasGetIorPutI, hasVorFtemps;
+ IRSB *bb, *bb2;
+
+ n_total++;
+
+ /* First flatten the block out, since all other
+ phases assume flat code. */
+
+ bb = flatten_BB ( bb0 );
+
+ if (iropt_verbose) {
+ vex_printf("\n========= FLAT\n\n" );
+ ppIRSB(bb);
+ }
+
+ /* If at level 0, stop now. */
+ if (vex_control.iropt_level <= 0) return bb;
+
+ /* Now do a preliminary cleanup pass, and figure out if we also
+ need to do 'expensive' optimisations. Expensive optimisations
+ are deemed necessary if the block contains any GetIs or PutIs.
+ If needed, do expensive transformations and then another cheap
+ cleanup pass. */
+
+ bb = cheap_transformations( bb, specHelper, preciseMemExnsFn );
+
+ if (guest_arch == VexArchARM) {
+ /* Translating Thumb2 code produces a lot of chaff. We have to
+ work extra hard to get rid of it. */
+ bb = cprop_BB(bb);
+ bb = spec_helpers_BB ( bb, specHelper );
+ redundant_put_removal_BB ( bb, preciseMemExnsFn );
+ do_deadcode_BB( bb );
+ }
+
+ if (vex_control.iropt_level > 1) {
+
+ /* Peer at what we have, to decide how much more effort to throw
+ at it. */
+ considerExpensives( &hasGetIorPutI, &hasVorFtemps, bb );
+
+ if (hasVorFtemps && !hasGetIorPutI) {
+ /* If any evidence of FP or Vector activity, CSE, as that
+ tends to mop up all manner of lardy code to do with
+ rounding modes. Don't bother if hasGetIorPutI since that
+ case leads into the expensive transformations, which do
+ CSE anyway. */
+ (void)do_cse_BB( bb );
+ do_deadcode_BB( bb );
+ }
+
+ if (hasGetIorPutI) {
+ Bool cses;
+ n_expensive++;
+ if (DEBUG_IROPT)
+ vex_printf("***** EXPENSIVE %d %d\n", n_total, n_expensive);
+ bb = expensive_transformations( bb );
+ bb = cheap_transformations( bb, specHelper, preciseMemExnsFn );
+ /* Potentially common up GetIs */
+ cses = do_cse_BB( bb );
+ if (cses)
+ bb = cheap_transformations( bb, specHelper, preciseMemExnsFn );
+ }
+
+ /* Now have a go at unrolling simple (single-BB) loops. If
+ successful, clean up the results as much as possible. */
+
+ bb2 = maybe_loop_unroll_BB( bb, guest_addr );
+ if (bb2) {
+ bb = cheap_transformations( bb2, specHelper, preciseMemExnsFn );
+ if (hasGetIorPutI) {
+ bb = expensive_transformations( bb );
+ bb = cheap_transformations( bb, specHelper, preciseMemExnsFn );
+ } else {
+ /* at least do CSE and dead code removal */
+ do_cse_BB( bb );
+ do_deadcode_BB( bb );
+ }
+ if (0) vex_printf("vex iropt: unrolled a loop\n");
+ }
+
+ }
+
+ return bb;
+}
+
+
+/*---------------------------------------------------------------*/
+/*--- end ir_opt.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/ir_opt.h b/VEX/priv/ir_opt.h
new file mode 100644
index 0000000..ecdb146
--- /dev/null
+++ b/VEX/priv/ir_opt.h
@@ -0,0 +1,71 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin ir_opt.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#ifndef __VEX_IR_OPT_H
+#define __VEX_IR_OPT_H
+
+#include "libvex_basictypes.h"
+#include "libvex_ir.h"
+#include "libvex.h"
+
+/* Top level optimiser entry point. Returns a new BB. Operates
+ under the control of the global "vex_control" struct. */
+extern
+IRSB* do_iropt_BB(
+ IRSB* bb,
+ IRExpr* (*specHelper) (HChar*, IRExpr**, IRStmt**, Int),
+ Bool (*preciseMemExnsFn)(Int,Int),
+ Addr64 guest_addr,
+ VexArch guest_arch
+ );
+
+/* Do a constant folding/propagation pass. */
+extern
+IRSB* cprop_BB ( IRSB* );
+
+/* Do a dead-code removal pass. bb is destructively modified. */
+extern
+void do_deadcode_BB ( IRSB* bb );
+
+/* The tree-builder. Make (approximately) maximal safe trees. bb is
+ destructively modified. */
+extern
+void ado_treebuild_BB ( IRSB* bb );
+
+#endif /* ndef __VEX_IR_OPT_H */
+
+/*---------------------------------------------------------------*/
+/*--- end ir_opt.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/main_globals.c b/VEX/priv/main_globals.c
new file mode 100644
index 0000000..716fa75
--- /dev/null
+++ b/VEX/priv/main_globals.c
@@ -0,0 +1,71 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin main_globals.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+
+#include "main_util.h"
+#include "main_globals.h"
+
+
+/* Global settings for the VEX library. These are the
+ only library-wide globals. */
+
+/* Are we started yet? */
+Bool vex_initdone = False;
+
+/* failure exit function */
+__attribute__ ((noreturn))
+void (*vex_failure_exit) ( void ) = NULL;
+
+/* logging output function */
+void (*vex_log_bytes) ( HChar*, Int nbytes ) = NULL;
+
+/* debug paranoia level */
+Int vex_debuglevel = 0;
+
+/* trace flags */
+Int vex_traceflags = 0;
+
+/* Are we supporting valgrind checking? */
+Bool vex_valgrind_support = False;
+
+/* Optimiser/front-end control */
+VexControl vex_control = { 0,0,False,0,0,0 };
+
+
+
+/*---------------------------------------------------------------*/
+/*--- end main_globals.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/main_globals.h b/VEX/priv/main_globals.h
new file mode 100644
index 0000000..5b561a3
--- /dev/null
+++ b/VEX/priv/main_globals.h
@@ -0,0 +1,84 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin main_globals.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#ifndef __VEX_MAIN_GLOBALS_H
+#define __VEX_MAIN_GLOBALS_H
+
+#include "libvex_basictypes.h"
+#include "libvex.h"
+
+
+/* Global settings for the VEX library. These are the
+ only library-wide globals. */
+
+/* Are we started yet? */
+extern Bool vex_initdone;
+
+/* failure exit function */
+__attribute__ ((noreturn))
+extern void (*vex_failure_exit) ( void );
+
+/* logging output function */
+extern void (*vex_log_bytes) ( HChar*, Int nbytes );
+
+/* debug paranoia level */
+extern Int vex_debuglevel;
+
+/* trace flags */
+extern Int vex_traceflags;
+
+/* Are we supporting valgrind checking? */
+extern Bool vex_valgrind_support;
+
+/* Optimiser/front-end control */
+extern VexControl vex_control;
+
+
+/* vex_traceflags values */
+#define VEX_TRACE_FE (1 << 7) /* show conversion into IR */
+#define VEX_TRACE_OPT1 (1 << 6) /* show after initial opt */
+#define VEX_TRACE_INST (1 << 5) /* show after instrumentation */
+#define VEX_TRACE_OPT2 (1 << 4) /* show after second opt */
+#define VEX_TRACE_TREES (1 << 3) /* show after tree building */
+#define VEX_TRACE_VCODE (1 << 2) /* show selected insns */
+#define VEX_TRACE_RCODE (1 << 1) /* show after reg-alloc */
+#define VEX_TRACE_ASM (1 << 0) /* show final assembly */
+
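+/* Illustrative use (a sketch, not part of the library): a client
+   wanting to see both the front-end IR and the final assembly could
+   set, in its VexTranslateArgs,
+      vta.traceflags = VEX_TRACE_FE | VEX_TRACE_ASM;
+*/
+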
+
+#endif /* ndef __VEX_MAIN_GLOBALS_H */
+
+/*---------------------------------------------------------------*/
+/*--- end main_globals.h ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/main_main.c b/VEX/priv/main_main.c
new file mode 100644
index 0000000..1e80972
--- /dev/null
+++ b/VEX/priv/main_main.c
@@ -0,0 +1,911 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin main_main.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex.h"
+#include "libvex_emwarn.h"
+#include "libvex_guest_x86.h"
+#include "libvex_guest_amd64.h"
+#include "libvex_guest_arm.h"
+#include "libvex_guest_ppc32.h"
+#include "libvex_guest_ppc64.h"
+
+#include "main_globals.h"
+#include "main_util.h"
+#include "host_generic_regs.h"
+#include "ir_opt.h"
+
+#include "host_x86_defs.h"
+#include "host_amd64_defs.h"
+#include "host_ppc_defs.h"
+#include "host_arm_defs.h"
+
+#include "guest_generic_bb_to_IR.h"
+#include "guest_x86_defs.h"
+#include "guest_amd64_defs.h"
+#include "guest_arm_defs.h"
+#include "guest_ppc_defs.h"
+
+#include "host_generic_simd128.h"
+
+
+/* This file contains the top level interface to the library. */
+
+/* --------- fwds ... --------- */
+
+static Bool are_valid_hwcaps ( VexArch arch, UInt hwcaps );
+static HChar* show_hwcaps ( VexArch arch, UInt hwcaps );
+
+
+/* --------- Initialise the library. --------- */
+
+/* Exported to library client. */
+
+void LibVEX_default_VexControl ( /*OUT*/ VexControl* vcon )
+{
+ vcon->iropt_verbosity = 0;
+ vcon->iropt_level = 2;
+ vcon->iropt_precise_memory_exns = False;
+ vcon->iropt_unroll_thresh = 120;
+ vcon->guest_max_insns = 60;
+ vcon->guest_chase_thresh = 10;
+ vcon->guest_chase_cond = False;
+}
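+
+/* An illustrative client-side sequence (a sketch; failure_exit and
+   log_bytes stand for the client's own callbacks):
+      VexControl vcon;
+      LibVEX_default_VexControl(&vcon);
+      vcon.iropt_level = 1;             /* e.g. less aggressive iropt */
+      LibVEX_Init(failure_exit, log_bytes, 0, False, &vcon);
+*/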
+
+
+/* Exported to library client. */
+
+void LibVEX_Init (
+ /* failure exit function */
+ __attribute__ ((noreturn))
+ void (*failure_exit) ( void ),
+ /* logging output function */
+ void (*log_bytes) ( HChar*, Int nbytes ),
+ /* debug paranoia level */
+ Int debuglevel,
+ /* Are we supporting valgrind checking? */
+ Bool valgrind_support,
+ /* Control ... */
+ /*READONLY*/VexControl* vcon
+)
+{
+ /* First off, do enough minimal setup so that the following
+ assertions can fail in a sane fashion, if need be. */
+ vex_failure_exit = failure_exit;
+ vex_log_bytes = log_bytes;
+
+ /* Now it's safe to check parameters for sanity. */
+ vassert(!vex_initdone);
+ vassert(failure_exit);
+ vassert(log_bytes);
+ vassert(debuglevel >= 0);
+
+ vassert(vcon->iropt_verbosity >= 0);
+ vassert(vcon->iropt_level >= 0);
+ vassert(vcon->iropt_level <= 2);
+ vassert(vcon->iropt_unroll_thresh >= 0);
+ vassert(vcon->iropt_unroll_thresh <= 400);
+ vassert(vcon->guest_max_insns >= 1);
+ vassert(vcon->guest_max_insns <= 100);
+ vassert(vcon->guest_chase_thresh >= 0);
+ vassert(vcon->guest_chase_thresh < vcon->guest_max_insns);
+ vassert(vcon->guest_chase_cond == True
+ || vcon->guest_chase_cond == False);
+
+ /* Check that Vex has been built with sizes of basic types as
+ stated in priv/libvex_basictypes.h. Failure of any of these is
+ a serious configuration error and should be corrected
+ immediately. If any of these assertions fail you can fully
+ expect Vex not to work properly, if at all. */
+
+ vassert(1 == sizeof(UChar));
+ vassert(1 == sizeof(Char));
+ vassert(2 == sizeof(UShort));
+ vassert(2 == sizeof(Short));
+ vassert(4 == sizeof(UInt));
+ vassert(4 == sizeof(Int));
+ vassert(8 == sizeof(ULong));
+ vassert(8 == sizeof(Long));
+ vassert(4 == sizeof(Float));
+ vassert(8 == sizeof(Double));
+ vassert(1 == sizeof(Bool));
+ vassert(4 == sizeof(Addr32));
+ vassert(8 == sizeof(Addr64));
+ vassert(16 == sizeof(U128));
+ vassert(16 == sizeof(V128));
+
+ vassert(sizeof(void*) == 4 || sizeof(void*) == 8);
+ vassert(sizeof(void*) == sizeof(int*));
+ vassert(sizeof(void*) == sizeof(HWord));
+
+ vassert(VEX_HOST_WORDSIZE == sizeof(void*));
+ vassert(VEX_HOST_WORDSIZE == sizeof(HWord));
+
+ /* Really start up .. */
+ vex_debuglevel = debuglevel;
+ vex_valgrind_support = valgrind_support;
+ vex_control = *vcon;
+ vex_initdone = True;
+ vexSetAllocMode ( VexAllocModeTEMP );
+}
+
+
+/* --------- Make a translation. --------- */
+
+/* Exported to library client. */
+
+VexTranslateResult LibVEX_Translate ( VexTranslateArgs* vta )
+{
+   /* This is the bundle of functions we need to do the back-end stuff
+ (insn selection, reg-alloc, assembly) whilst being insulated
+ from the target instruction set. */
+ HReg* available_real_regs;
+ Int n_available_real_regs;
+ Bool (*isMove) ( HInstr*, HReg*, HReg* );
+ void (*getRegUsage) ( HRegUsage*, HInstr*, Bool );
+ void (*mapRegs) ( HRegRemap*, HInstr*, Bool );
+ void (*genSpill) ( HInstr**, HInstr**, HReg, Int, Bool );
+ void (*genReload) ( HInstr**, HInstr**, HReg, Int, Bool );
+ HInstr* (*directReload) ( HInstr*, HReg, Short );
+ void (*ppInstr) ( HInstr*, Bool );
+ void (*ppReg) ( HReg );
+ HInstrArray* (*iselSB) ( IRSB*, VexArch, VexArchInfo*,
+ VexAbiInfo* );
+ Int (*emit) ( UChar*, Int, HInstr*, Bool, void* );
+ IRExpr* (*specHelper) ( HChar*, IRExpr**, IRStmt**, Int );
+ Bool (*preciseMemExnsFn) ( Int, Int );
+
+ DisOneInstrFn disInstrFn;
+
+ VexGuestLayout* guest_layout;
+ Bool host_is_bigendian = False;
+ IRSB* irsb;
+ HInstrArray* vcode;
+ HInstrArray* rcode;
+ Int i, j, k, out_used, guest_sizeB;
+ Int offB_TISTART, offB_TILEN;
+ UChar insn_bytes[32];
+ IRType guest_word_type;
+ IRType host_word_type;
+ Bool mode64;
+
+ guest_layout = NULL;
+ available_real_regs = NULL;
+ n_available_real_regs = 0;
+ isMove = NULL;
+ getRegUsage = NULL;
+ mapRegs = NULL;
+ genSpill = NULL;
+ genReload = NULL;
+ directReload = NULL;
+ ppInstr = NULL;
+ ppReg = NULL;
+ iselSB = NULL;
+ emit = NULL;
+ specHelper = NULL;
+ preciseMemExnsFn = NULL;
+ disInstrFn = NULL;
+ guest_word_type = Ity_INVALID;
+ host_word_type = Ity_INVALID;
+ offB_TISTART = 0;
+ offB_TILEN = 0;
+ mode64 = False;
+
+ vex_traceflags = vta->traceflags;
+
+ vassert(vex_initdone);
+ vexSetAllocModeTEMP_and_clear();
+ vexAllocSanityCheck();
+
+ /* First off, check that the guest and host insn sets
+ are supported. */
+
+ switch (vta->arch_host) {
+
+ case VexArchX86:
+ mode64 = False;
+ getAllocableRegs_X86 ( &n_available_real_regs,
+ &available_real_regs );
+ isMove = (Bool(*)(HInstr*,HReg*,HReg*)) isMove_X86Instr;
+ getRegUsage = (void(*)(HRegUsage*,HInstr*, Bool))
+ getRegUsage_X86Instr;
+ mapRegs = (void(*)(HRegRemap*,HInstr*, Bool)) mapRegs_X86Instr;
+ genSpill = (void(*)(HInstr**,HInstr**,HReg,Int,Bool))
+ genSpill_X86;
+ genReload = (void(*)(HInstr**,HInstr**,HReg,Int,Bool))
+ genReload_X86;
+ directReload = (HInstr*(*)(HInstr*,HReg,Short)) directReload_X86;
+ ppInstr = (void(*)(HInstr*, Bool)) ppX86Instr;
+ ppReg = (void(*)(HReg)) ppHRegX86;
+ iselSB = iselSB_X86;
+ emit = (Int(*)(UChar*,Int,HInstr*,Bool,void*)) emit_X86Instr;
+ host_is_bigendian = False;
+ host_word_type = Ity_I32;
+ vassert(are_valid_hwcaps(VexArchX86, vta->archinfo_host.hwcaps));
+ vassert(vta->dispatch != NULL); /* jump-to-dispatcher scheme */
+ break;
+
+ case VexArchAMD64:
+ mode64 = True;
+ getAllocableRegs_AMD64 ( &n_available_real_regs,
+ &available_real_regs );
+ isMove = (Bool(*)(HInstr*,HReg*,HReg*)) isMove_AMD64Instr;
+ getRegUsage = (void(*)(HRegUsage*,HInstr*, Bool))
+ getRegUsage_AMD64Instr;
+ mapRegs = (void(*)(HRegRemap*,HInstr*, Bool)) mapRegs_AMD64Instr;
+ genSpill = (void(*)(HInstr**,HInstr**,HReg,Int,Bool))
+ genSpill_AMD64;
+ genReload = (void(*)(HInstr**,HInstr**,HReg,Int,Bool))
+ genReload_AMD64;
+ ppInstr = (void(*)(HInstr*, Bool)) ppAMD64Instr;
+ ppReg = (void(*)(HReg)) ppHRegAMD64;
+ iselSB = iselSB_AMD64;
+ emit = (Int(*)(UChar*,Int,HInstr*,Bool,void*)) emit_AMD64Instr;
+ host_is_bigendian = False;
+ host_word_type = Ity_I64;
+ vassert(are_valid_hwcaps(VexArchAMD64, vta->archinfo_host.hwcaps));
+ vassert(vta->dispatch != NULL); /* jump-to-dispatcher scheme */
+ break;
+
+ case VexArchPPC32:
+ mode64 = False;
+ getAllocableRegs_PPC ( &n_available_real_regs,
+ &available_real_regs, mode64 );
+ isMove = (Bool(*)(HInstr*,HReg*,HReg*)) isMove_PPCInstr;
+ getRegUsage = (void(*)(HRegUsage*,HInstr*,Bool)) getRegUsage_PPCInstr;
+ mapRegs = (void(*)(HRegRemap*,HInstr*,Bool)) mapRegs_PPCInstr;
+ genSpill = (void(*)(HInstr**,HInstr**,HReg,Int,Bool)) genSpill_PPC;
+ genReload = (void(*)(HInstr**,HInstr**,HReg,Int,Bool)) genReload_PPC;
+ ppInstr = (void(*)(HInstr*,Bool)) ppPPCInstr;
+ ppReg = (void(*)(HReg)) ppHRegPPC;
+ iselSB = iselSB_PPC;
+ emit = (Int(*)(UChar*,Int,HInstr*,Bool,void*)) emit_PPCInstr;
+ host_is_bigendian = True;
+ host_word_type = Ity_I32;
+ vassert(are_valid_hwcaps(VexArchPPC32, vta->archinfo_host.hwcaps));
+ vassert(vta->dispatch == NULL); /* return-to-dispatcher scheme */
+ break;
+
+ case VexArchPPC64:
+ mode64 = True;
+ getAllocableRegs_PPC ( &n_available_real_regs,
+ &available_real_regs, mode64 );
+ isMove = (Bool(*)(HInstr*,HReg*,HReg*)) isMove_PPCInstr;
+ getRegUsage = (void(*)(HRegUsage*,HInstr*, Bool)) getRegUsage_PPCInstr;
+ mapRegs = (void(*)(HRegRemap*,HInstr*, Bool)) mapRegs_PPCInstr;
+ genSpill = (void(*)(HInstr**,HInstr**,HReg,Int,Bool)) genSpill_PPC;
+ genReload = (void(*)(HInstr**,HInstr**,HReg,Int,Bool)) genReload_PPC;
+ ppInstr = (void(*)(HInstr*, Bool)) ppPPCInstr;
+ ppReg = (void(*)(HReg)) ppHRegPPC;
+ iselSB = iselSB_PPC;
+ emit = (Int(*)(UChar*,Int,HInstr*,Bool,void*)) emit_PPCInstr;
+ host_is_bigendian = True;
+ host_word_type = Ity_I64;
+ vassert(are_valid_hwcaps(VexArchPPC64, vta->archinfo_host.hwcaps));
+ vassert(vta->dispatch == NULL); /* return-to-dispatcher scheme */
+ break;
+
+ case VexArchARM:
+ mode64 = False;
+ getAllocableRegs_ARM ( &n_available_real_regs,
+ &available_real_regs );
+ isMove = (Bool(*)(HInstr*,HReg*,HReg*)) isMove_ARMInstr;
+ getRegUsage = (void(*)(HRegUsage*,HInstr*, Bool)) getRegUsage_ARMInstr;
+ mapRegs = (void(*)(HRegRemap*,HInstr*, Bool)) mapRegs_ARMInstr;
+ genSpill = (void(*)(HInstr**,HInstr**,HReg,Int,Bool)) genSpill_ARM;
+ genReload = (void(*)(HInstr**,HInstr**,HReg,Int,Bool)) genReload_ARM;
+ ppInstr = (void(*)(HInstr*, Bool)) ppARMInstr;
+ ppReg = (void(*)(HReg)) ppHRegARM;
+ iselSB = iselSB_ARM;
+ emit = (Int(*)(UChar*,Int,HInstr*,Bool,void*)) emit_ARMInstr;
+ host_is_bigendian = False;
+ host_word_type = Ity_I32;
+ vassert(are_valid_hwcaps(VexArchARM, vta->archinfo_host.hwcaps));
+ vassert(vta->dispatch == NULL); /* return-to-dispatcher scheme */
+ break;
+
+ default:
+ vpanic("LibVEX_Translate: unsupported host insn set");
+ }
+
+
+ switch (vta->arch_guest) {
+
+ case VexArchX86:
+ preciseMemExnsFn = guest_x86_state_requires_precise_mem_exns;
+ disInstrFn = disInstr_X86;
+ specHelper = guest_x86_spechelper;
+ guest_sizeB = sizeof(VexGuestX86State);
+ guest_word_type = Ity_I32;
+ guest_layout = &x86guest_layout;
+ offB_TISTART = offsetof(VexGuestX86State,guest_TISTART);
+ offB_TILEN = offsetof(VexGuestX86State,guest_TILEN);
+ vassert(are_valid_hwcaps(VexArchX86, vta->archinfo_guest.hwcaps));
+ vassert(0 == sizeof(VexGuestX86State) % 16);
+ vassert(sizeof( ((VexGuestX86State*)0)->guest_TISTART) == 4);
+ vassert(sizeof( ((VexGuestX86State*)0)->guest_TILEN ) == 4);
+ vassert(sizeof( ((VexGuestX86State*)0)->guest_NRADDR ) == 4);
+ break;
+
+ case VexArchAMD64:
+ preciseMemExnsFn = guest_amd64_state_requires_precise_mem_exns;
+ disInstrFn = disInstr_AMD64;
+ specHelper = guest_amd64_spechelper;
+ guest_sizeB = sizeof(VexGuestAMD64State);
+ guest_word_type = Ity_I64;
+ guest_layout = &amd64guest_layout;
+ offB_TISTART = offsetof(VexGuestAMD64State,guest_TISTART);
+ offB_TILEN = offsetof(VexGuestAMD64State,guest_TILEN);
+ vassert(are_valid_hwcaps(VexArchAMD64, vta->archinfo_guest.hwcaps));
+ vassert(0 == sizeof(VexGuestAMD64State) % 16);
+ vassert(sizeof( ((VexGuestAMD64State*)0)->guest_TISTART ) == 8);
+ vassert(sizeof( ((VexGuestAMD64State*)0)->guest_TILEN ) == 8);
+ vassert(sizeof( ((VexGuestAMD64State*)0)->guest_NRADDR ) == 8);
+ break;
+
+ case VexArchPPC32:
+ preciseMemExnsFn = guest_ppc32_state_requires_precise_mem_exns;
+ disInstrFn = disInstr_PPC;
+ specHelper = guest_ppc32_spechelper;
+ guest_sizeB = sizeof(VexGuestPPC32State);
+ guest_word_type = Ity_I32;
+ guest_layout = &ppc32Guest_layout;
+ offB_TISTART = offsetof(VexGuestPPC32State,guest_TISTART);
+ offB_TILEN = offsetof(VexGuestPPC32State,guest_TILEN);
+ vassert(are_valid_hwcaps(VexArchPPC32, vta->archinfo_guest.hwcaps));
+ vassert(0 == sizeof(VexGuestPPC32State) % 16);
+ vassert(sizeof( ((VexGuestPPC32State*)0)->guest_TISTART ) == 4);
+ vassert(sizeof( ((VexGuestPPC32State*)0)->guest_TILEN ) == 4);
+ vassert(sizeof( ((VexGuestPPC32State*)0)->guest_NRADDR ) == 4);
+ break;
+
+ case VexArchPPC64:
+ preciseMemExnsFn = guest_ppc64_state_requires_precise_mem_exns;
+ disInstrFn = disInstr_PPC;
+ specHelper = guest_ppc64_spechelper;
+ guest_sizeB = sizeof(VexGuestPPC64State);
+ guest_word_type = Ity_I64;
+ guest_layout = &ppc64Guest_layout;
+ offB_TISTART = offsetof(VexGuestPPC64State,guest_TISTART);
+ offB_TILEN = offsetof(VexGuestPPC64State,guest_TILEN);
+ vassert(are_valid_hwcaps(VexArchPPC64, vta->archinfo_guest.hwcaps));
+ vassert(0 == sizeof(VexGuestPPC64State) % 16);
+ vassert(sizeof( ((VexGuestPPC64State*)0)->guest_TISTART ) == 8);
+ vassert(sizeof( ((VexGuestPPC64State*)0)->guest_TILEN ) == 8);
+ vassert(sizeof( ((VexGuestPPC64State*)0)->guest_NRADDR ) == 8);
+ vassert(sizeof( ((VexGuestPPC64State*)0)->guest_NRADDR_GPR2) == 8);
+ break;
+
+ case VexArchARM:
+ preciseMemExnsFn = guest_arm_state_requires_precise_mem_exns;
+ disInstrFn = disInstr_ARM;
+ specHelper = guest_arm_spechelper;
+ guest_sizeB = sizeof(VexGuestARMState);
+ guest_word_type = Ity_I32;
+ guest_layout = &armGuest_layout;
+ offB_TISTART = offsetof(VexGuestARMState,guest_TISTART);
+ offB_TILEN = offsetof(VexGuestARMState,guest_TILEN);
+ vassert(are_valid_hwcaps(VexArchARM, vta->archinfo_guest.hwcaps));
+ vassert(0 == sizeof(VexGuestARMState) % 16);
+ vassert(sizeof( ((VexGuestARMState*)0)->guest_TISTART) == 4);
+ vassert(sizeof( ((VexGuestARMState*)0)->guest_TILEN ) == 4);
+ vassert(sizeof( ((VexGuestARMState*)0)->guest_NRADDR ) == 4);
+ break;
+
+ default:
+ vpanic("LibVEX_Translate: unsupported guest insn set");
+ }
+
+ /* yet more sanity checks ... */
+ if (vta->arch_guest == vta->arch_host) {
+      /* doesn't necessarily have to be true, but if it isn't it means
+         we are simulating one flavour of an architecture on a
+         different flavour of the same architecture, which is pretty
+         strange. */
+ vassert(vta->archinfo_guest.hwcaps == vta->archinfo_host.hwcaps);
+ }
+
+ vexAllocSanityCheck();
+
+ if (vex_traceflags & VEX_TRACE_FE)
+ vex_printf("\n------------------------"
+ " Front end "
+ "------------------------\n\n");
+
+ irsb = bb_to_IR ( vta->guest_extents,
+ vta->callback_opaque,
+ disInstrFn,
+ vta->guest_bytes,
+ vta->guest_bytes_addr,
+ vta->chase_into_ok,
+ host_is_bigendian,
+ vta->arch_guest,
+ &vta->archinfo_guest,
+ &vta->abiinfo_both,
+ guest_word_type,
+ vta->do_self_check,
+ vta->preamble_function,
+ offB_TISTART,
+ offB_TILEN );
+
+ vexAllocSanityCheck();
+
+ if (irsb == NULL) {
+ /* Access failure. */
+ vexSetAllocModeTEMP_and_clear();
+ vex_traceflags = 0;
+ return VexTransAccessFail;
+ }
+
+ vassert(vta->guest_extents->n_used >= 1 && vta->guest_extents->n_used <= 3);
+ vassert(vta->guest_extents->base[0] == vta->guest_bytes_addr);
+ for (i = 0; i < vta->guest_extents->n_used; i++) {
+ vassert(vta->guest_extents->len[i] < 10000); /* sanity */
+ }
+
+ /* If debugging, show the raw guest bytes for this bb. */
+ if (0 || (vex_traceflags & VEX_TRACE_FE)) {
+ if (vta->guest_extents->n_used > 1) {
+ vex_printf("can't show code due to extents > 1\n");
+ } else {
+ /* HACK */
+ UChar* p = (UChar*)vta->guest_bytes;
+ UInt sum = 0;
+ UInt guest_bytes_read = (UInt)vta->guest_extents->len[0];
+ vex_printf("GuestBytes %llx %u ", vta->guest_bytes_addr,
+ guest_bytes_read );
+ for (i = 0; i < guest_bytes_read; i++) {
+ UInt b = (UInt)p[i];
+ vex_printf(" %02x", b );
+ sum = (sum << 1) ^ b;
+ }
+ vex_printf(" %08x\n\n", sum);
+ }
+ }
+
+ /* Sanity check the initial IR. */
+ sanityCheckIRSB( irsb, "initial IR",
+ False/*can be non-flat*/, guest_word_type );
+
+ vexAllocSanityCheck();
+
+ /* Clean it up, hopefully a lot. */
+ irsb = do_iropt_BB ( irsb, specHelper, preciseMemExnsFn,
+ vta->guest_bytes_addr,
+ vta->arch_guest );
+ sanityCheckIRSB( irsb, "after initial iropt",
+ True/*must be flat*/, guest_word_type );
+
+ if (vex_traceflags & VEX_TRACE_OPT1) {
+ vex_printf("\n------------------------"
+ " After pre-instr IR optimisation "
+ "------------------------\n\n");
+ ppIRSB ( irsb );
+ vex_printf("\n");
+ }
+
+ vexAllocSanityCheck();
+
+ /* Get the thing instrumented. */
+ if (vta->instrument1)
+ irsb = vta->instrument1(vta->callback_opaque,
+ irsb, guest_layout,
+ vta->guest_extents,
+ guest_word_type, host_word_type);
+ vexAllocSanityCheck();
+
+ if (vta->instrument2)
+ irsb = vta->instrument2(vta->callback_opaque,
+ irsb, guest_layout,
+ vta->guest_extents,
+ guest_word_type, host_word_type);
+
+ if (vex_traceflags & VEX_TRACE_INST) {
+ vex_printf("\n------------------------"
+ " After instrumentation "
+ "------------------------\n\n");
+ ppIRSB ( irsb );
+ vex_printf("\n");
+ }
+
+ if (vta->instrument1 || vta->instrument2)
+ sanityCheckIRSB( irsb, "after instrumentation",
+ True/*must be flat*/, guest_word_type );
+
+ /* Do a post-instrumentation cleanup pass. */
+ if (vta->instrument1 || vta->instrument2) {
+ do_deadcode_BB( irsb );
+ irsb = cprop_BB( irsb );
+ do_deadcode_BB( irsb );
+ sanityCheckIRSB( irsb, "after post-instrumentation cleanup",
+ True/*must be flat*/, guest_word_type );
+ }
+
+ vexAllocSanityCheck();
+
+ if (vex_traceflags & VEX_TRACE_OPT2) {
+ vex_printf("\n------------------------"
+ " After post-instr IR optimisation "
+ "------------------------\n\n");
+ ppIRSB ( irsb );
+ vex_printf("\n");
+ }
+
+ /* Turn it into virtual-registerised code. Build trees -- this
+ also throws away any dead bindings. */
+ ado_treebuild_BB( irsb );
+
+ if (vta->finaltidy) {
+ irsb = vta->finaltidy(irsb);
+ }
+
+ vexAllocSanityCheck();
+
+ if (vex_traceflags & VEX_TRACE_TREES) {
+ vex_printf("\n------------------------"
+ " After tree-building "
+ "------------------------\n\n");
+ ppIRSB ( irsb );
+ vex_printf("\n");
+ }
+
+ /* HACK */
+ if (0) { *(vta->host_bytes_used) = 0; return VexTransOK; }
+ /* end HACK */
+
+ if (vex_traceflags & VEX_TRACE_VCODE)
+ vex_printf("\n------------------------"
+ " Instruction selection "
+ "------------------------\n");
+
+ vcode = iselSB ( irsb, vta->arch_host, &vta->archinfo_host,
+ &vta->abiinfo_both );
+
+ vexAllocSanityCheck();
+
+ if (vex_traceflags & VEX_TRACE_VCODE)
+ vex_printf("\n");
+
+ if (vex_traceflags & VEX_TRACE_VCODE) {
+ for (i = 0; i < vcode->arr_used; i++) {
+ vex_printf("%3d ", i);
+ ppInstr(vcode->arr[i], mode64);
+ vex_printf("\n");
+ }
+ vex_printf("\n");
+ }
+
+ /* Register allocate. */
+ rcode = doRegisterAllocation ( vcode, available_real_regs,
+ n_available_real_regs,
+ isMove, getRegUsage, mapRegs,
+ genSpill, genReload, directReload,
+ guest_sizeB,
+ ppInstr, ppReg, mode64 );
+
+ vexAllocSanityCheck();
+
+ if (vex_traceflags & VEX_TRACE_RCODE) {
+ vex_printf("\n------------------------"
+ " Register-allocated code "
+ "------------------------\n\n");
+ for (i = 0; i < rcode->arr_used; i++) {
+ vex_printf("%3d ", i);
+ ppInstr(rcode->arr[i], mode64);
+ vex_printf("\n");
+ }
+ vex_printf("\n");
+ }
+
+ /* HACK */
+ if (0) { *(vta->host_bytes_used) = 0; return VexTransOK; }
+ /* end HACK */
+
+ /* Assemble */
+ if (vex_traceflags & VEX_TRACE_ASM) {
+ vex_printf("\n------------------------"
+ " Assembly "
+ "------------------------\n\n");
+ }
+
+ out_used = 0; /* tracks along the host_bytes array */
+ for (i = 0; i < rcode->arr_used; i++) {
+ if (vex_traceflags & VEX_TRACE_ASM) {
+ ppInstr(rcode->arr[i], mode64);
+ vex_printf("\n");
+ }
+ j = (*emit)( insn_bytes, 32, rcode->arr[i], mode64, vta->dispatch );
+ if (vex_traceflags & VEX_TRACE_ASM) {
+ for (k = 0; k < j; k++)
+ if (insn_bytes[k] < 16)
+ vex_printf("0%x ", (UInt)insn_bytes[k]);
+ else
+ vex_printf("%x ", (UInt)insn_bytes[k]);
+ vex_printf("\n\n");
+ }
+ if (out_used + j > vta->host_bytes_size) {
+ vexSetAllocModeTEMP_and_clear();
+ vex_traceflags = 0;
+ return VexTransOutputFull;
+ }
+ for (k = 0; k < j; k++) {
+ vta->host_bytes[out_used] = insn_bytes[k];
+ out_used++;
+ }
+ vassert(out_used <= vta->host_bytes_size);
+ }
+ *(vta->host_bytes_used) = out_used;
+
+ vexAllocSanityCheck();
+
+ vexSetAllocModeTEMP_and_clear();
+
+ vex_traceflags = 0;
+ return VexTransOK;
+}
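+
+/* How a client might dispatch on the results produced above (an
+   illustrative sketch; vta is a filled-in VexTranslateArgs):
+      switch (LibVEX_Translate(&vta)) {
+         case VexTransOK:
+            break; // vta.host_bytes[0 .. *vta.host_bytes_used-1]
+                   // now holds the translation
+         case VexTransAccessFail:
+            break; // the guest bytes could not be read
+         case VexTransOutputFull:
+            break; // retry with a larger host_bytes buffer
+      }
+*/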
+
+
+/* --------- Emulation warnings. --------- */
+
+HChar* LibVEX_EmWarn_string ( VexEmWarn ew )
+{
+ switch (ew) {
+ case EmWarn_NONE:
+ return "none";
+ case EmWarn_X86_x87exns:
+ return "Unmasking x87 FP exceptions";
+ case EmWarn_X86_x87precision:
+ return "Selection of non-80-bit x87 FP precision";
+ case EmWarn_X86_sseExns:
+ return "Unmasking SSE FP exceptions";
+ case EmWarn_X86_fz:
+ return "Setting %mxcsr.fz (SSE flush-underflows-to-zero mode)";
+ case EmWarn_X86_daz:
+ return "Setting %mxcsr.daz (SSE treat-denormals-as-zero mode)";
+ case EmWarn_X86_acFlag:
+ return "Setting %eflags.ac (setting noted but ignored)";
+ case EmWarn_PPCexns:
+ return "Unmasking PPC32/64 FP exceptions";
+ case EmWarn_PPC64_redir_overflow:
+ return "PPC64 function redirection stack overflow";
+ case EmWarn_PPC64_redir_underflow:
+ return "PPC64 function redirection stack underflow";
+ default:
+ vpanic("LibVEX_EmWarn_string: unknown warning");
+ }
+}
+
+/* ------------------ Arch/HwCaps stuff. ------------------ */
+
+const HChar* LibVEX_ppVexArch ( VexArch arch )
+{
+ switch (arch) {
+ case VexArch_INVALID: return "INVALID";
+ case VexArchX86: return "X86";
+ case VexArchAMD64: return "AMD64";
+ case VexArchARM: return "ARM";
+ case VexArchPPC32: return "PPC32";
+ case VexArchPPC64: return "PPC64";
+ default: return "VexArch???";
+ }
+}
+
+const HChar* LibVEX_ppVexHwCaps ( VexArch arch, UInt hwcaps )
+{
+ HChar* str = show_hwcaps(arch,hwcaps);
+ return str ? str : "INVALID";
+}
+
+
+/* Write default settings into *vai. */
+void LibVEX_default_VexArchInfo ( /*OUT*/VexArchInfo* vai )
+{
+ vai->hwcaps = 0;
+ vai->ppc_cache_line_szB = 0;
+ vai->ppc_dcbz_szB = 0;
+ vai->ppc_dcbzl_szB = 0;
+}
+
+/* Write default settings into *vbi. */
+void LibVEX_default_VexAbiInfo ( /*OUT*/VexAbiInfo* vbi )
+{
+ vbi->guest_stack_redzone_size = 0;
+ vbi->guest_amd64_assume_fs_is_zero = False;
+ vbi->guest_amd64_assume_gs_is_0x60 = False;
+ vbi->guest_ppc_zap_RZ_at_blr = False;
+ vbi->guest_ppc_zap_RZ_at_bl = NULL;
+ vbi->guest_ppc_sc_continues_at_LR = False;
+ vbi->host_ppc_calls_use_fndescrs = False;
+ vbi->host_ppc32_regalign_int64_args = False;
+}
+
+
+/* Return a string showing the hwcaps in a nice way. The string will
+ be NULL for invalid combinations of flags, so these functions also
+ serve as a way to validate hwcaps values. */
+
+static HChar* show_hwcaps_x86 ( UInt hwcaps )
+{
+ /* Monotonic, SSE3 > SSE2 > SSE1 > baseline. */
+ switch (hwcaps) {
+ case 0:
+ return "x86-sse0";
+ case VEX_HWCAPS_X86_SSE1:
+ return "x86-sse1";
+ case VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2:
+ return "x86-sse1-sse2";
+ case VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2
+ | VEX_HWCAPS_X86_LZCNT:
+ return "x86-sse1-sse2-lzcnt";
+ case VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2
+ | VEX_HWCAPS_X86_SSE3:
+ return "x86-sse1-sse2-sse3";
+ case VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2
+ | VEX_HWCAPS_X86_SSE3 | VEX_HWCAPS_X86_LZCNT:
+ return "x86-sse1-sse2-sse3-lzcnt";
+ default:
+ return NULL;
+ }
+}
+
+static HChar* show_hwcaps_amd64 ( UInt hwcaps )
+{
+ /* SSE3 and CX16 are orthogonal and > baseline, although we really
+ don't expect to come across anything which can do SSE3 but can't
+ do CX16. Still, we can handle that case. LZCNT is similarly
+ orthogonal. */
+ switch (hwcaps) {
+ case 0:
+ return "amd64-sse2";
+ case VEX_HWCAPS_AMD64_SSE3:
+ return "amd64-sse3";
+ case VEX_HWCAPS_AMD64_CX16:
+ return "amd64-sse2-cx16";
+ case VEX_HWCAPS_AMD64_SSE3 | VEX_HWCAPS_AMD64_CX16:
+ return "amd64-sse3-cx16";
+ case VEX_HWCAPS_AMD64_SSE3 | VEX_HWCAPS_AMD64_LZCNT:
+ return "amd64-sse3-lzcnt";
+ case VEX_HWCAPS_AMD64_CX16 | VEX_HWCAPS_AMD64_LZCNT:
+ return "amd64-sse2-cx16-lzcnt";
+ case VEX_HWCAPS_AMD64_SSE3 | VEX_HWCAPS_AMD64_CX16
+ | VEX_HWCAPS_AMD64_LZCNT:
+ return "amd64-sse3-cx16-lzcnt";
+
+ default:
+ return NULL;
+ }
+}
+
+static HChar* show_hwcaps_ppc32 ( UInt hwcaps )
+{
+ /* Monotonic with complications. Basically V > F > baseline,
+ but once you have F then you can have FX or GX too. */
+ const UInt F = VEX_HWCAPS_PPC32_F;
+ const UInt V = VEX_HWCAPS_PPC32_V;
+ const UInt FX = VEX_HWCAPS_PPC32_FX;
+ const UInt GX = VEX_HWCAPS_PPC32_GX;
+ UInt c = hwcaps;
+ if (c == 0) return "ppc32-int";
+ if (c == F) return "ppc32-int-flt";
+ if (c == (F|FX)) return "ppc32-int-flt-FX";
+ if (c == (F|GX)) return "ppc32-int-flt-GX";
+ if (c == (F|FX|GX)) return "ppc32-int-flt-FX-GX";
+ if (c == (F|V)) return "ppc32-int-flt-vmx";
+ if (c == (F|V|FX)) return "ppc32-int-flt-vmx-FX";
+ if (c == (F|V|GX)) return "ppc32-int-flt-vmx-GX";
+ if (c == (F|V|FX|GX)) return "ppc32-int-flt-vmx-FX-GX";
+ return NULL;
+}
+
+static HChar* show_hwcaps_ppc64 ( UInt hwcaps )
+{
+ /* Monotonic with complications. Basically V > baseline(==F),
+ but once you have F then you can have FX or GX too. */
+ const UInt V = VEX_HWCAPS_PPC64_V;
+ const UInt FX = VEX_HWCAPS_PPC64_FX;
+ const UInt GX = VEX_HWCAPS_PPC64_GX;
+ UInt c = hwcaps;
+ if (c == 0) return "ppc64-int-flt";
+ if (c == FX) return "ppc64-int-flt-FX";
+ if (c == GX) return "ppc64-int-flt-GX";
+ if (c == (FX|GX)) return "ppc64-int-flt-FX-GX";
+ if (c == V) return "ppc64-int-flt-vmx";
+ if (c == (V|FX)) return "ppc64-int-flt-vmx-FX";
+ if (c == (V|GX)) return "ppc64-int-flt-vmx-GX";
+ if (c == (V|FX|GX)) return "ppc64-int-flt-vmx-FX-GX";
+ return NULL;
+}
+
+static HChar* show_hwcaps_arm ( UInt hwcaps )
+{
+ Bool N = ((hwcaps & VEX_HWCAPS_ARM_NEON) != 0);
+ Bool vfp = ((hwcaps & (VEX_HWCAPS_ARM_VFP |
+ VEX_HWCAPS_ARM_VFP2 | VEX_HWCAPS_ARM_VFP3)) != 0);
+ switch (VEX_ARM_ARCHLEVEL(hwcaps)) {
+      case 5:
+         if (N)
+            return NULL;
+         if (vfp)
+            return "ARMv5-vfp";
+         else
+            return "ARMv5";
+      case 6:
+         if (N)
+            return NULL;
+         if (vfp)
+            return "ARMv6-vfp";
+         else
+            return "ARMv6";
+ case 7:
+ if (vfp) {
+ if (N)
+ return "ARMv7-vfp-neon";
+ else
+ return "ARMv7-vfp";
+ } else {
+ if (N)
+ return "ARMv7-neon";
+ else
+ return "ARMv7";
+ }
+ default:
+ return NULL;
+ }
+ return NULL;
+}
+
+/* ---- */
+static HChar* show_hwcaps ( VexArch arch, UInt hwcaps )
+{
+ switch (arch) {
+ case VexArchX86: return show_hwcaps_x86(hwcaps);
+ case VexArchAMD64: return show_hwcaps_amd64(hwcaps);
+ case VexArchPPC32: return show_hwcaps_ppc32(hwcaps);
+ case VexArchPPC64: return show_hwcaps_ppc64(hwcaps);
+ case VexArchARM: return show_hwcaps_arm(hwcaps);
+ default: return NULL;
+ }
+}
+
+static Bool are_valid_hwcaps ( VexArch arch, UInt hwcaps )
+{
+ return show_hwcaps(arch,hwcaps) != NULL;
+}
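+
+/* Example of the validation behaviour (illustrative): the x86
+   encodings are monotonic, so
+      are_valid_hwcaps(VexArchX86,
+                       VEX_HWCAPS_X86_SSE1 | VEX_HWCAPS_X86_SSE2)
+   is True, whereas SSE2 without SSE1,
+      are_valid_hwcaps(VexArchX86, VEX_HWCAPS_X86_SSE2),
+   is False. */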
+
+
+/*---------------------------------------------------------------*/
+/*--- end main_main.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/main_util.c b/VEX/priv/main_util.c
new file mode 100644
index 0000000..d12380e
--- /dev/null
+++ b/VEX/priv/main_util.c
@@ -0,0 +1,538 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin main_util.c ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#include "libvex_basictypes.h"
+#include "libvex.h"
+
+#include "main_globals.h"
+#include "main_util.h"
+
+
+/*---------------------------------------------------------*/
+/*--- Storage ---*/
+/*---------------------------------------------------------*/
+
+/* Try to keep this as low as possible -- in particular, less than the
+ size of the smallest L2 cache we might encounter. At 50000, my VIA
+ Nehemiah 1 GHz (a weedy machine) can satisfy 27 million calls/
+ second to LibVEX_Alloc(16) -- that is, allocate memory at over 400
+ MByte/sec. Once the size increases enough to fall out of the cache
+ into memory, the rate falls by about a factor of 3.
+*/
+#define N_TEMPORARY_BYTES 5000000
+
+static HChar temporary[N_TEMPORARY_BYTES] __attribute__((aligned(8)));
+static HChar* temporary_first = &temporary[0];
+static HChar* temporary_curr = &temporary[0];
+static HChar* temporary_last = &temporary[N_TEMPORARY_BYTES-1];
+
+static ULong temporary_bytes_allocd_TOT = 0;
+
+#define N_PERMANENT_BYTES 10000
+
+static HChar permanent[N_PERMANENT_BYTES] __attribute__((aligned(8)));
+static HChar* permanent_first = &permanent[0];
+static HChar* permanent_curr = &permanent[0];
+static HChar* permanent_last = &permanent[N_PERMANENT_BYTES-1];
+
+static VexAllocMode mode = VexAllocModeTEMP;
+
+void vexAllocSanityCheck ( void )
+{
+ vassert(temporary_first == &temporary[0]);
+ vassert(temporary_last == &temporary[N_TEMPORARY_BYTES-1]);
+ vassert(permanent_first == &permanent[0]);
+ vassert(permanent_last == &permanent[N_PERMANENT_BYTES-1]);
+ vassert(temporary_first <= temporary_curr);
+ vassert(temporary_curr <= temporary_last);
+ vassert(permanent_first <= permanent_curr);
+ vassert(permanent_curr <= permanent_last);
+ vassert(private_LibVEX_alloc_first <= private_LibVEX_alloc_curr);
+ vassert(private_LibVEX_alloc_curr <= private_LibVEX_alloc_last);
+ if (mode == VexAllocModeTEMP){
+ vassert(private_LibVEX_alloc_first == temporary_first);
+ vassert(private_LibVEX_alloc_last == temporary_last);
+ }
+ else
+ if (mode == VexAllocModePERM) {
+ vassert(private_LibVEX_alloc_first == permanent_first);
+ vassert(private_LibVEX_alloc_last == permanent_last);
+ }
+ else
+ vassert(0);
+
+# define IS_WORD_ALIGNED(p) (0 == (((HWord)p) & (sizeof(HWord)-1)))
+ vassert(sizeof(HWord) == 4 || sizeof(HWord) == 8);
+ vassert(IS_WORD_ALIGNED(temporary_first));
+ vassert(IS_WORD_ALIGNED(temporary_curr));
+ vassert(IS_WORD_ALIGNED(temporary_last+1));
+ vassert(IS_WORD_ALIGNED(permanent_first));
+ vassert(IS_WORD_ALIGNED(permanent_curr));
+ vassert(IS_WORD_ALIGNED(permanent_last+1));
+ vassert(IS_WORD_ALIGNED(private_LibVEX_alloc_first));
+ vassert(IS_WORD_ALIGNED(private_LibVEX_alloc_curr));
+ vassert(IS_WORD_ALIGNED(private_LibVEX_alloc_last+1));
+# undef IS_WORD_ALIGNED
+}
+
+/* The current allocation mode. */
+
+void vexSetAllocMode ( VexAllocMode m )
+{
+ vexAllocSanityCheck();
+
+ /* Save away the current allocation point .. */
+   if (mode == VexAllocModeTEMP) {
+      temporary_curr = private_LibVEX_alloc_curr;
+   }
+   else if (mode == VexAllocModePERM) {
+      permanent_curr = private_LibVEX_alloc_curr;
+   }
+   else {
+      vassert(0);
+   }
+
+ /* Did that screw anything up? */
+ vexAllocSanityCheck();
+
+   if (m == VexAllocModeTEMP) {
+      private_LibVEX_alloc_first = temporary_first;
+      private_LibVEX_alloc_curr  = temporary_curr;
+      private_LibVEX_alloc_last  = temporary_last;
+   }
+   else if (m == VexAllocModePERM) {
+      private_LibVEX_alloc_first = permanent_first;
+      private_LibVEX_alloc_curr  = permanent_curr;
+      private_LibVEX_alloc_last  = permanent_last;
+   }
+   else {
+      vassert(0);
+   }
+
+ mode = m;
+}
+
+VexAllocMode vexGetAllocMode ( void )
+{
+ return mode;
+}
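+
+/* Illustrative usage sketch, not part of the original source: code
+   that needs an allocation to outlive the current translation
+   typically saves the mode, switches to the permanent pool, and
+   restores afterwards:
+
+      VexAllocMode saved = vexGetAllocMode();
+      vexSetAllocMode(VexAllocModePERM);
+      ... allocate the long-lived object here ...
+      vexSetAllocMode(saved);
+*/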
+
+/* Visible to library client, unfortunately. */
+
+HChar* private_LibVEX_alloc_first = &temporary[0];
+HChar* private_LibVEX_alloc_curr = &temporary[0];
+HChar* private_LibVEX_alloc_last = &temporary[N_TEMPORARY_BYTES-1];
+
+__attribute__((noreturn))
+void private_LibVEX_alloc_OOM(void)
+{
+ HChar* pool = "???";
+ if (private_LibVEX_alloc_first == &temporary[0]) pool = "TEMP";
+ if (private_LibVEX_alloc_first == &permanent[0]) pool = "PERM";
+ vex_printf("VEX temporary storage exhausted.\n");
+ vex_printf("Pool = %s, start %p curr %p end %p (size %lld)\n",
+ pool,
+ private_LibVEX_alloc_first,
+ private_LibVEX_alloc_curr,
+ private_LibVEX_alloc_last,
+ (Long)(private_LibVEX_alloc_last + 1 - private_LibVEX_alloc_first));
+   vpanic("VEX storage exhausted.\n"
+          "Increase N_{TEMPORARY,PERMANENT}_BYTES and recompile.");
+}
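+
+/* Sketch (illustrative only, not from the original source) of the
+   bump-pointer allocation these pools support; the real allocator
+   that clients see, LibVEX_Alloc, is defined elsewhere.  Assumes
+   only the pointer names and OOM handler defined above. */
+#if 0
+static void* bump_alloc_sketch ( Int nbytes )
+{
+   /* Round the request up to a whole number of HWords, since the
+      sanity checks above expect word-aligned cursors. */
+   Int    align = (Int)sizeof(HWord);
+   Int    n     = (nbytes + align - 1) & ~(align - 1);
+   HChar* res   = private_LibVEX_alloc_curr;
+   if (res + n > private_LibVEX_alloc_last + 1)
+      private_LibVEX_alloc_OOM();
+   private_LibVEX_alloc_curr = res + n;
+   return res;
+}
+#endif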
+
+void vexSetAllocModeTEMP_and_clear ( void )
+{
+ /* vassert(vex_initdone); */ /* causes infinite assert loops */
+ temporary_bytes_allocd_TOT
+ += (ULong)(private_LibVEX_alloc_curr - private_LibVEX_alloc_first);
+
+ mode = VexAllocModeTEMP;
+ temporary_curr = &temporary[0];
+ private_LibVEX_alloc_curr = &temporary[0];
+
+   /* Change the 0 below to 1, and pick a fill byte of 0x00 or 0xFF,
+      to flush out bugs caused by reading uninitialised memory in the
+      main VEX storage area. */
+ if (0) {
+ Int i;
+ for (i = 0; i < N_TEMPORARY_BYTES; i++)
+ temporary[i] = 0x00;
+ }
+
+ vexAllocSanityCheck();
+}
+
+
+/* Exported to library client. */
+
+void LibVEX_ShowAllocStats ( void )
+{
+ vex_printf("vex storage: T total %lld bytes allocated\n",
+ (Long)temporary_bytes_allocd_TOT );
+ vex_printf("vex storage: P total %lld bytes allocated\n",
+ (Long)(permanent_curr - permanent_first) );
+}
+
+
+/*---------------------------------------------------------*/
+/*--- Bombing out ---*/
+/*---------------------------------------------------------*/
+
+__attribute__ ((noreturn))
+void vex_assert_fail ( const HChar* expr,
+ const HChar* file, Int line, const HChar* fn )
+{
+ vex_printf( "\nvex: %s:%d (%s): Assertion `%s' failed.\n",
+ file, line, fn, expr );
+ (*vex_failure_exit)();
+}
+
+__attribute__ ((noreturn))
+void vpanic ( HChar* str )
+{
+ vex_printf("\nvex: the `impossible' happened:\n %s\n", str);
+ (*vex_failure_exit)();
+}
+
+
+/*---------------------------------------------------------*/
+/*--- vex_printf ---*/
+/*---------------------------------------------------------*/
+
+/* This should be the only <...> include in the entire VEX library.
+   New code for main_util.c should go above this point. */
+#include <stdarg.h>
+
+static Int vex_strlen ( const HChar* str )
+{
+ Int i = 0;
+ while (str[i] != 0) i++;
+ return i;
+}
+
+Bool vex_streq ( const HChar* s1, const HChar* s2 )
+{
+ while (True) {
+ if (*s1 == 0 && *s2 == 0)
+ return True;
+ if (*s1 != *s2)
+ return False;
+ s1++;
+ s2++;
+ }
+}
+
+
+/* Convert N0 into ascii in BUF, which is assumed to be big enough (at
+ least 67 bytes long). Observe BASE, SYNED and HEXCAPS. */
+static
+void convert_int ( /*OUT*/HChar* buf, Long n0,
+ Int base, Bool syned, Bool hexcaps )
+{
+ ULong u0;
+ HChar c;
+ Bool minus = False;
+ Int i, j, bufi = 0;
+ buf[bufi] = 0;
+
+   if (syned && n0 < 0) {
+      minus = True;
+      /* Negate in unsigned arithmetic, so that the most negative
+         Long does not cause signed-overflow undefined behaviour. */
+      u0 = - (ULong)n0;
+   } else {
+      u0 = (ULong)n0;
+   }
+
+ while (1) {
+ buf[bufi++] = toHChar('0' + toUInt(u0 % base));
+ u0 /= base;
+ if (u0 == 0) break;
+ }
+ if (minus)
+ buf[bufi++] = '-';
+
+ buf[bufi] = 0;
+ for (i = 0; i < bufi; i++)
+ if (buf[i] > '9')
+ buf[i] = toHChar(buf[i] + (hexcaps ? 'A' : 'a') - '9' - 1);
+
+ i = 0;
+ j = bufi-1;
+ while (i <= j) {
+ c = buf[i];
+ buf[i] = buf[j];
+ buf[j] = c;
+ i++;
+ j--;
+ }
+}
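+
+/* Illustrative self-test for convert_int, not part of the original
+   source; left disabled, but it shows the expected outputs. */
+#if 0
+static void convert_int_demo ( void )
+{
+   HChar buf[67];
+   convert_int(buf, -255, 10, True,  False);   /* yields "-255" */
+   vassert(vex_streq(buf, "-255"));
+   convert_int(buf,  255, 16, False, True);    /* yields "FF" */
+   vassert(vex_streq(buf, "FF"));
+}
+#endif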
+
+
+/* A half-arsed and buggy, but good-enough, implementation of
+ printf. */
+static
+UInt vprintf_wrk ( void(*sink)(HChar),
+ HChar* format,
+ va_list ap )
+{
+# define PUT(_ch) \
+ do { sink(_ch); nout++; } \
+ while (0)
+
+# define PAD(_n) \
+ do { Int _qq = (_n); for (; _qq > 0; _qq--) PUT(padchar); } \
+ while (0)
+
+# define PUTSTR(_str) \
+ do { HChar* _qq = _str; for (; *_qq; _qq++) PUT(*_qq); } \
+ while (0)
+
+ HChar* saved_format;
+ Bool longlong, ljustify;
+ HChar padchar;
+ Int fwidth, nout, len1, len2, len3;
+ HChar intbuf[100]; /* big enough for a 64-bit # in base 2 */
+
+ nout = 0;
+ while (1) {
+
+ if (!format)
+ break;
+ if (*format == 0)
+ break;
+
+ if (*format != '%') {
+ PUT(*format);
+ format++;
+ continue;
+ }
+
+ saved_format = format;
+ longlong = False;
+ ljustify = False;
+ padchar = ' ';
+ fwidth = 0;
+ format++;
+
+ if (*format == '-') {
+ format++;
+ ljustify = True;
+ }
+ if (*format == '0') {
+ format++;
+ padchar = '0';
+ }
+ while (*format >= '0' && *format <= '9') {
+ fwidth = fwidth * 10 + (*format - '0');
+ format++;
+ }
+ if (*format == 'l') {
+ format++;
+ if (*format == 'l') {
+ format++;
+ longlong = True;
+ }
+ }
+
+ switch (*format) {
+ case 's': {
+ HChar* str = va_arg(ap, HChar*);
+ if (str == NULL)
+ str = "(null)";
+ len1 = len3 = 0;
+ len2 = vex_strlen(str);
+ if (fwidth > len2) { len1 = ljustify ? 0 : fwidth-len2;
+ len3 = ljustify ? fwidth-len2 : 0; }
+ PAD(len1); PUTSTR(str); PAD(len3);
+ break;
+ }
+ case 'c': {
+ HChar c = (HChar)va_arg(ap, int);
+ HChar str[2];
+ str[0] = c;
+ str[1] = 0;
+ len1 = len3 = 0;
+ len2 = vex_strlen(str);
+ if (fwidth > len2) { len1 = ljustify ? 0 : fwidth-len2;
+ len3 = ljustify ? fwidth-len2 : 0; }
+ PAD(len1); PUTSTR(str); PAD(len3);
+ break;
+ }
+ case 'd': {
+ Long l;
+ if (longlong) {
+ l = va_arg(ap, Long);
+ } else {
+ l = (Long)va_arg(ap, Int);
+ }
+ convert_int(intbuf, l, 10/*base*/, True/*signed*/,
+ False/*irrelevant*/);
+ len1 = len3 = 0;
+ len2 = vex_strlen(intbuf);
+ if (fwidth > len2) { len1 = ljustify ? 0 : fwidth-len2;
+ len3 = ljustify ? fwidth-len2 : 0; }
+ PAD(len1); PUTSTR(intbuf); PAD(len3);
+ break;
+ }
+ case 'u':
+ case 'x':
+ case 'X': {
+ Int base = *format == 'u' ? 10 : 16;
+      Bool hexcaps = True; /* forced True, so %x also prints
+                              uppercase; (*format == 'X') would
+                              give a lowercase %x */
+ ULong l;
+ if (longlong) {
+ l = va_arg(ap, ULong);
+ } else {
+ l = (ULong)va_arg(ap, UInt);
+ }
+ convert_int(intbuf, l, base, False/*unsigned*/, hexcaps);
+ len1 = len3 = 0;
+ len2 = vex_strlen(intbuf);
+ if (fwidth > len2) { len1 = ljustify ? 0 : fwidth-len2;
+ len3 = ljustify ? fwidth-len2 : 0; }
+ PAD(len1); PUTSTR(intbuf); PAD(len3);
+ break;
+ }
+ case 'p':
+ case 'P': {
+ Bool hexcaps = toBool(*format == 'P');
+ ULong l = Ptr_to_ULong( va_arg(ap, void*) );
+ convert_int(intbuf, l, 16/*base*/, False/*unsigned*/, hexcaps);
+ len1 = len3 = 0;
+ len2 = vex_strlen(intbuf)+2;
+ if (fwidth > len2) { len1 = ljustify ? 0 : fwidth-len2;
+ len3 = ljustify ? fwidth-len2 : 0; }
+ PAD(len1); PUT('0'); PUT('x'); PUTSTR(intbuf); PAD(len3);
+ break;
+ }
+ case '%': {
+ PUT('%');
+ break;
+ }
+ default:
+ /* no idea what it is. Print the format literally and
+ move on. */
+ while (saved_format <= format) {
+ PUT(*saved_format);
+ saved_format++;
+ }
+ break;
+ }
+
+ format++;
+
+ }
+
+ return nout;
+
+# undef PUT
+# undef PAD
+# undef PUTSTR
+}
+
+
+/* A general replacement for printf().  Note that only low-level
+   debugging info should be sent this way.  The official route is to
+   use vg_message().  This interface is deprecated.
+*/
+static HChar myprintf_buf[1000];
+static Int n_myprintf_buf;
+
+static void add_to_myprintf_buf ( HChar c )
+{
+ Bool emit = toBool(c == '\n' || n_myprintf_buf >= 1000-10 /*paranoia*/);
+ myprintf_buf[n_myprintf_buf++] = c;
+ myprintf_buf[n_myprintf_buf] = 0;
+ if (emit) {
+ (*vex_log_bytes)( myprintf_buf, vex_strlen(myprintf_buf) );
+ n_myprintf_buf = 0;
+ myprintf_buf[n_myprintf_buf] = 0;
+ }
+}
+
+UInt vex_printf ( HChar* format, ... )
+{
+ UInt ret;
+ va_list vargs;
+ va_start(vargs,format);
+
+ n_myprintf_buf = 0;
+ myprintf_buf[n_myprintf_buf] = 0;
+ ret = vprintf_wrk ( add_to_myprintf_buf, format, vargs );
+
+ if (n_myprintf_buf > 0) {
+ (*vex_log_bytes)( myprintf_buf, n_myprintf_buf );
+ }
+
+ va_end(vargs);
+
+ return ret;
+}
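+
+/* Illustrative only, not in the original source: one call exercising
+   every conversion vprintf_wrk accepts.  Note that %x and %X both
+   print uppercase at present. */
+#if 0
+static void vex_printf_demo ( void )
+{
+   vex_printf("%s %c %d %lld %u %x %08llx %p %%\n",
+              "hello", '!', -42, -42LL, 42u, 0xBEEFu,
+              0xDEADBEEFULL, (void*)&myprintf_buf[0]);
+}
+#endif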
+
+
+/* A general replacement for sprintf(). */
+
+static HChar *vg_sprintf_ptr;
+
+static void add_to_vg_sprintf_buf ( HChar c )
+{
+ *vg_sprintf_ptr++ = c;
+}
+
+UInt vex_sprintf ( HChar* buf, HChar *format, ... )
+{
+   UInt ret;
+ va_list vargs;
+
+ vg_sprintf_ptr = buf;
+
+ va_start(vargs,format);
+
+ ret = vprintf_wrk ( add_to_vg_sprintf_buf, format, vargs );
+ add_to_vg_sprintf_buf(0);
+
+ va_end(vargs);
+
+   vassert(vex_strlen(buf) == (Int)ret);
+ return ret;
+}
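+
+/* Usage sketch (illustrative): vex_sprintf does no bounds checking,
+   so the caller must supply a buffer known to be large enough for
+   the fully-expanded string, e.g.
+
+      HChar buf[64];
+      UInt  len = vex_sprintf(buf, "delta = %lld", someLongValue);
+
+   where someLongValue stands for any Long-sized argument. */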
+
+
+/*---------------------------------------------------------------*/
+/*--- end main_util.c ---*/
+/*---------------------------------------------------------------*/
diff --git a/VEX/priv/main_util.h b/VEX/priv/main_util.h
new file mode 100644
index 0000000..1392b4b
--- /dev/null
+++ b/VEX/priv/main_util.h
@@ -0,0 +1,101 @@
+
+/*---------------------------------------------------------------*/
+/*--- begin main_util.h ---*/
+/*---------------------------------------------------------------*/
+
+/*
+ This file is part of Valgrind, a dynamic binary instrumentation
+ framework.
+
+ Copyright (C) 2004-2010 OpenWorks LLP
+ info@open-works.net
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License as
+ published by the Free Software Foundation; either version 2 of the
+ License, or (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful, but
+ WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ The GNU General Public License is contained in the file COPYING.
+
+ Neither the names of the U.S. Department of Energy nor the
+ University of California nor the names of its contributors may be
+ used to endorse or promote products derived from this software
+ without prior written permission.
+*/
+
+#ifndef __VEX_MAIN_UTIL_H
+#define __VEX_MAIN_UTIL_H
+
+#include "libvex_basictypes.h"
+
+
+/* Misc. */
+
+#define NULL ((void*)0)
+
+
+/* Stuff for panicking and assertion. */
+
+#define VG__STRING(__str) #__str
+
+#define vassert(expr) \
+ ((void) ((expr) ? 0 : \
+ (vex_assert_fail (VG__STRING(expr), \
+ __FILE__, __LINE__, \
+ __PRETTY_FUNCTION__), 0)))
+
+__attribute__ ((__noreturn__))
+extern void vex_assert_fail ( const HChar* expr, const HChar* file,
+ Int line, const HChar* fn );
+__attribute__ ((__noreturn__))
+extern void vpanic ( HChar* str );
+
+
+/* Printing */
+
+__attribute__ ((format (printf, 1, 2)))
+extern UInt vex_printf ( HChar *format, ... );
+
+__attribute__ ((format (printf, 2, 3)))
+extern UInt vex_sprintf ( HChar* buf, HChar *format, ... );
+
+
+/* String ops */
+
+extern Bool vex_streq ( const HChar* s1, const HChar* s2 );
+
+
+/* Storage management: clear the area, and allocate from it. */
+
+/* By default allocation occurs in the temporary area.  However, it is
+   possible to switch to permanent-area allocation if that's what you
+   want.  Permanent-area allocation is very limited, though. */
+
+typedef
+ enum {
+ VexAllocModeTEMP,
+ VexAllocModePERM
+ }
+ VexAllocMode;
+
+extern void vexSetAllocMode ( VexAllocMode );
+extern VexAllocMode vexGetAllocMode ( void );
+extern void vexAllocSanityCheck ( void );
+
+extern void vexSetAllocModeTEMP_and_clear ( void );
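+
+/* Typical per-translation lifecycle, as an illustrative sketch: the
+   translation driver clears the TEMP pool before building IR, so all
+   IR trees for one superblock can later be discarded wholesale:
+
+      vexSetAllocModeTEMP_and_clear();
+      ... construct and optimise IR; it all lands in the TEMP pool ...
+*/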
+
+#endif /* ndef __VEX_MAIN_UTIL_H */
+
+/*---------------------------------------------------------------*/
+/*--- end                                           main_util.h ---*/
+/*---------------------------------------------------------------*/